diff --git a/projects/rocprofiler-compute/.github/CODEOWNERS b/projects/rocprofiler-compute/.github/CODEOWNERS index f2e47c5c3a..56098b9c73 100644 --- a/projects/rocprofiler-compute/.github/CODEOWNERS +++ b/projects/rocprofiler-compute/.github/CODEOWNERS @@ -1,6 +1,7 @@ * @koomie @coleramos425 # Documentation files -docs/* @ROCm/rocm-documentation +docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation +.readthedocs.yaml @ROCm/rocm-documentation diff --git a/projects/rocprofiler-compute/.github/workflows/dependabot.yml b/projects/rocprofiler-compute/.github/workflows/dependabot.yml new file mode 100644 index 0000000000..48d6228bad --- /dev/null +++ b/projects/rocprofiler-compute/.github/workflows/dependabot.yml @@ -0,0 +1,18 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/docs/sphinx" # Location of package manifests + open-pull-requests-limit: 10 + schedule: + interval: "daily" + target-branch: "dev" + labels: + - "documentation" + - "dependencies" + reviewers: + - "samjwu" diff --git a/projects/rocprofiler-compute/.github/workflows/docs.yml b/projects/rocprofiler-compute/.github/workflows/docs.yml index 843537adab..7b5f30bbcc 100644 --- a/projects/rocprofiler-compute/.github/workflows/docs.yml +++ b/projects/rocprofiler-compute/.github/workflows/docs.yml @@ -4,10 +4,9 @@ on: push: branches: ["main"] paths: - - 'src/docs' - - 'src/archive/docs-1.x' + - 'docs/archive/docs-2.x/**' + - 'docs/archive/docs-1.x/**' - '.github/workflows/docs.yml' - - 'VERSION' workflow_dispatch: @@ -31,24 +30,24 @@ jobs: - name: Checkout uses: actions/checkout@v4 - name: 
Additional python packages - run: pip3 install -r requirements-doc.txt + run: pip3 install -r docs/archive/requirements-doc.txt - name: Setup Pages uses: actions/configure-pages@v4 - name: Build 1.x docs run: | - cd src/archive/docs-1.x + cd docs/archive/docs-1.x make html - - name: Build current docs + - name: Build 2.x docs run: | - cd src/docs + cd docs/archive/docs-2.x make html - name: Relocate 1.x docs run: | - mv src/archive/docs-1.x/_build/html src/docs/_build/html/1.x + mv docs/archive/docs-1.x/_build/html docs/archive/_build/html/1.x - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: - path: ./src/docs/_build/html + path: ./docs/archive/_build/html # Deployment job deploy: diff --git a/projects/rocprofiler-compute/.gitignore b/projects/rocprofiler-compute/.gitignore index 4d6df9d13e..3b3a34d40e 100644 --- a/projects/rocprofiler-compute/.gitignore +++ b/projects/rocprofiler-compute/.gitignore @@ -19,3 +19,8 @@ VERSION.sha # temp files /tests/Testing + +# documentation artifacts +/_build +_toc.yml + diff --git a/projects/rocprofiler-compute/.readthedocs.yaml b/projects/rocprofiler-compute/.readthedocs.yaml new file mode 100644 index 0000000000..ed04e0a35d --- /dev/null +++ b/projects/rocprofiler-compute/.readthedocs.yaml @@ -0,0 +1,13 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +python: + install: + - requirements: docs/sphinx/requirements.txt diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index 7c7a69595f..9a1632a112 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -189,46 +189,51 @@ message(STATUS "Pytest CPU threadcount: ${PYTEST_NUMPROCS}") add_test( NAME test_profile_kernel_execution - COMMAND ${Python3_EXECUTABLE} -m pytest -m kernel_execution 
--junitxml=tests/test_profile_kernel_execution.xml - ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + COMMAND + ${Python3_EXECUTABLE} -m pytest -m kernel_execution + --junitxml=tests/test_profile_kernel_execution.xml ${COV_OPTION} + ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) add_test( NAME test_profile_ipblocks - COMMAND ${Python3_EXECUTABLE} -m pytest -m block --junitxml=tests/test_profile_blocks.xml ${COV_OPTION} - ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + COMMAND + ${Python3_EXECUTABLE} -m pytest -m block --junitxml=tests/test_profile_blocks.xml + ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) set_property(TEST test_profile_ipblocks PROPERTY COST 11) add_test( NAME test_profile_dispatch - COMMAND ${Python3_EXECUTABLE} -m pytest -m dispatch --junitxml=tests/test_profile_dispatch.xml ${COV_OPTION} - ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + COMMAND + ${Python3_EXECUTABLE} -m pytest -m dispatch + --junitxml=tests/test_profile_dispatch.xml ${COV_OPTION} + ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) -set_property(TEST test_profile_ipblocks PROPERTY COST 5) +set_property(TEST test_profile_dispatch PROPERTY COST 5) add_test( NAME test_profile_mem - COMMAND ${Python3_EXECUTABLE} -m pytest -m mem --junitxml=tests/test_profile_mem.xml ${COV_OPTION} - ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + COMMAND ${Python3_EXECUTABLE} -m pytest -m mem --junitxml=tests/test_profile_mem.xml + ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) add_test( NAME test_profile_join - COMMAND ${Python3_EXECUTABLE} -m pytest -m join --junitxml=tests/test_profile_join.xml ${COV_OPTION} - ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + COMMAND ${Python3_EXECUTABLE} -m pytest -m join --junitxml=tests/test_profile_join.xml + ${COV_OPTION} 
${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) add_test( NAME test_profile_sort - COMMAND ${Python3_EXECUTABLE} -m pytest -m sort --junitxml=tests/test_profile_sort.xml ${COV_OPTION} - ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + COMMAND ${Python3_EXECUTABLE} -m pytest -m sort --junitxml=tests/test_profile_sort.xml + ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) add_test( NAME test_profile_misc - COMMAND ${Python3_EXECUTABLE} -m pytest -m misc --junitxml=tests/test_profile_misc.xml ${COV_OPTION} - ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + COMMAND ${Python3_EXECUTABLE} -m pytest -m misc --junitxml=tests/test_profile_misc.xml + ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) set_tests_properties( @@ -247,8 +252,10 @@ set_tests_properties( add_test( NAME test_analyze_commands - COMMAND ${Python3_EXECUTABLE} -m pytest -n ${PYTEST_NUMPROCS} --junitxml=tests/test_analyze_commands.xml - ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_analyze_commands.py + COMMAND + ${Python3_EXECUTABLE} -m pytest -n ${PYTEST_NUMPROCS} + --junitxml=tests/test_analyze_commands.xml ${COV_OPTION} + ${PROJECT_SOURCE_DIR}/tests/test_analyze_commands.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) # --------------------------- @@ -257,8 +264,10 @@ add_test( add_test( NAME test_analyze_workloads - COMMAND ${Python3_EXECUTABLE} -m pytest -n ${PYTEST_NUMPROCS} --junitxml=tests/test_analyze_workloads.xml - ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_analyze_workloads.py + COMMAND + ${Python3_EXECUTABLE} -m pytest -n ${PYTEST_NUMPROCS} + --junitxml=tests/test_analyze_workloads.xml ${COV_OPTION} + ${PROJECT_SOURCE_DIR}/tests/test_analyze_workloads.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) # --------- diff --git a/projects/rocprofiler-compute/README.md b/projects/rocprofiler-compute/README.md index 
3be1182bdd..cc8cc8b0d1 100644 --- a/projects/rocprofiler-compute/README.md +++ b/projects/rocprofiler-compute/README.md @@ -4,17 +4,17 @@ [![Docs](https://github.com/ROCm/omniperf/actions/workflows/docs.yml/badge.svg)](https://rocm.github.io/omniperf/) [![DOI](https://zenodo.org/badge/561919887.svg)](https://zenodo.org/badge/latestdoi/561919887) - # Omniperf ## General + Omniperf is a system performance profiling tool for machine learning/HPC workloads running on AMD MI GPUs. The tool presently targets usage on MI100, MI200, and MI300 accelerators. * For more information on available features, installation steps, and workload profiling and analysis, please refer to the online -[documentation](https://rocm.github.io/omniperf). +[documentation](https://rocm.docs.amd.com/projects/omniperf/en/latest/). * Omniperf is an AMD open source research project and is not supported as part of the ROCm software stack. We welcome contributions and diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/.gitignore b/projects/rocprofiler-compute/docs/archive/docs-1.x/.gitignore similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/.gitignore rename to projects/rocprofiler-compute/docs/archive/docs-1.x/.gitignore diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/.nojekyll b/projects/rocprofiler-compute/docs/archive/docs-1.x/.nojekyll similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/.nojekyll rename to projects/rocprofiler-compute/docs/archive/docs-1.x/.nojekyll diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/Makefile b/projects/rocprofiler-compute/docs/archive/docs-1.x/Makefile similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/Makefile rename to projects/rocprofiler-compute/docs/archive/docs-1.x/Makefile diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/README b/projects/rocprofiler-compute/docs/archive/docs-1.x/README similarity index 
100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/README rename to projects/rocprofiler-compute/docs/archive/docs-1.x/README diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/VERSION b/projects/rocprofiler-compute/docs/archive/docs-1.x/VERSION similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/VERSION rename to projects/rocprofiler-compute/docs/archive/docs-1.x/VERSION diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/analysis.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/analysis.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/analysis.md rename to projects/rocprofiler-compute/docs/archive/docs-1.x/analysis.md diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/conf.py b/projects/rocprofiler-compute/docs/archive/docs-1.x/conf.py similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/conf.py rename to projects/rocprofiler-compute/docs/archive/docs-1.x/conf.py diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/faq.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/faq.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/faq.md rename to projects/rocprofiler-compute/docs/archive/docs-1.x/faq.md diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/getting_started.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/getting_started.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/getting_started.md rename to projects/rocprofiler-compute/docs/archive/docs-1.x/getting_started.md diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/high_level_design.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/high_level_design.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/high_level_design.md rename to 
projects/rocprofiler-compute/docs/archive/docs-1.x/high_level_design.md diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Arithmetic_operations.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Arithmetic_operations.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Arithmetic_operations.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Arithmetic_operations.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Command_processor.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Command_processor.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Command_processor.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Command_processor.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Comp_pipe_sol.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Comp_pipe_sol.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Comp_pipe_sol.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Comp_pipe_sol.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Compute_pipeline_stats.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Compute_pipeline_stats.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Compute_pipeline_stats.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Compute_pipeline_stats.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Constant_cache_l2_interface.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Constant_cache_l2_interface.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Constant_cache_l2_interface.png rename to 
projects/rocprofiler-compute/docs/archive/docs-1.x/images/Constant_cache_l2_interface.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Constant_cache_stats.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Constant_cache_stats.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Constant_cache_stats.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Constant_cache_stats.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Current_and_baseline_dispatch_ids.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Current_and_baseline_dispatch_ids.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Current_and_baseline_dispatch_ids.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Current_and_baseline_dispatch_ids.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Instruc_cache_sol.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Instruc_cache_sol.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Instruc_cache_sol.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Instruc_cache_sol.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Instruction_cache_stats.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Instruction_cache_stats.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Instruction_cache_stats.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Instruction_cache_stats.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Instruction_mix.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Instruction_mix.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Instruction_mix.png rename to 
projects/rocprofiler-compute/docs/archive/docs-1.x/images/Instruction_mix.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Kernel_time_histogram.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Kernel_time_histogram.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Kernel_time_histogram.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Kernel_time_histogram.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L1D_sol.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1D_sol.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L1D_sol.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1D_sol.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_cache_stalls.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_cache_stalls.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_cache_stalls.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_cache_stalls.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_l2_transactions.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_l2_transactions.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_l2_transactions.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_l2_transactions.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_l2_transactions_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_l2_transactions_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_l2_transactions_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_l2_transactions_per_channel.png diff --git 
a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_utcl1_transactions.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_utcl1_transactions.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L1_utcl1_transactions.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L1_utcl1_transactions.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_cache_accesses.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_cache_accesses.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_cache_accesses.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_cache_accesses.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_cache_sol.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_cache_sol.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_cache_sol.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_cache_sol.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_latencies_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_latencies_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_latencies_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_latencies_per_channel.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_stalls.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_stalls.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_stalls.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_stalls.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_stalls_per_channel.png 
b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_stalls_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_stalls_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_stalls_per_channel.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_transactions.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_transactions.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_transactions.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_transactions.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_transactions_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_transactions_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_transactions_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_transactions_per_channel.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_write_stalls_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_write_stalls_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_write_stalls_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_write_stalls_per_channel.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_write_starvation_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_write_starvation_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/L2_ea_write_starvation_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/L2_ea_write_starvation_per_channel.png diff 
--git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/LDS_sol.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/LDS_sol.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/LDS_sol.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/LDS_sol.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/LDS_stats.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/LDS_stats.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/LDS_stats.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/LDS_stats.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/MFMA_arithmetic_instruction_mix.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/MFMA_arithmetic_instruction_mix.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/MFMA_arithmetic_instruction_mix.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/MFMA_arithmetic_instruction_mix.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Memory_chart_analysis.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Memory_chart_analysis.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Memory_chart_analysis.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Memory_chart_analysis.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Memory_latencies.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Memory_latencies.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Memory_latencies.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Memory_latencies.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Roofline_analysis.png 
b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Roofline_analysis.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Roofline_analysis.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Roofline_analysis.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Shader_processing_input.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Shader_processing_input.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Shader_processing_input.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Shader_processing_input.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/System_info_panel.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/System_info_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/System_info_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/System_info_panel.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/System_speed_of_light.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/System_speed_of_light.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/System_speed_of_light.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/System_speed_of_light.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Texture_address.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Texture_address.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Texture_address.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Texture_address.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Texture_data.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Texture_data.png similarity 
index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Texture_data.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Texture_data.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Top_bottleneck_dispatches.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Top_bottleneck_dispatches.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Top_bottleneck_dispatches.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Top_bottleneck_dispatches.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Top_bottleneck_kernels.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Top_bottleneck_kernels.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Top_bottleneck_kernels.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Top_bottleneck_kernels.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/VALU_arithmetic_instruction_mix.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/VALU_arithmetic_instruction_mix.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/VALU_arithmetic_instruction_mix.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/VALU_arithmetic_instruction_mix.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/VMEM_arithmetic_intensity_mix.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/VMEM_arithmetic_intensity_mix.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/VMEM_arithmetic_intensity_mix.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/VMEM_arithmetic_intensity_mix.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Vec_L1D_cache_accesses.png 
b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Vec_L1D_cache_accesses.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Vec_L1D_cache_accesses.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Vec_L1D_cache_accesses.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Vec_L1D_cache_sol.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Vec_L1D_cache_sol.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Vec_L1D_cache_sol.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Vec_L1D_cache_sol.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/Wavefront_launch.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/Wavefront_launch.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/Wavefront_launch.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/Wavefront_launch.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/datasource_config.jpg b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/datasource_config.jpg similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/datasource_config.jpg rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/datasource_config.jpg diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/datasource_settings.jpg b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/datasource_settings.jpg similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/datasource_settings.jpg rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/datasource_settings.jpg diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/global_variables.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/global_variables.png similarity 
index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/global_variables.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/global_variables.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/grafana_welcome.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/grafana_welcome.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/grafana_welcome.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/grafana_welcome.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/grafana_workload_selection.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/grafana_workload_selection.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/grafana_workload_selection.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/grafana_workload_selection.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/import_dashboard.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/import_dashboard.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/import_dashboard.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/import_dashboard.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/install_decision_tree.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/install_decision_tree.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/install_decision_tree.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/install_decision_tree.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/omniperf_architecture.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/omniperf_architecture.png similarity index 100% rename from 
projects/rocprofiler-compute/src/archive/docs-1.x/images/omniperf_architecture.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/omniperf_architecture.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/omniperf_server_vs_client_install.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/omniperf_server_vs_client_install.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/omniperf_server_vs_client_install.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/omniperf_server_vs_client_install.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/opening_dashboard.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/opening_dashboard.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/opening_dashboard.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/opening_dashboard.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/sample-roof-plot.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/sample-roof-plot.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/sample-roof-plot.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/sample-roof-plot.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/standalone_gui.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/standalone_gui.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/standalone_gui.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/standalone_gui.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/tunnel_demo1.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/tunnel_demo1.png similarity index 100% rename from 
projects/rocprofiler-compute/src/archive/docs-1.x/images/tunnel_demo1.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/tunnel_demo1.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/tunnel_demo2.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/tunnel_demo2.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/tunnel_demo2.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/tunnel_demo2.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/images/tunnel_demo3.png b/projects/rocprofiler-compute/docs/archive/docs-1.x/images/tunnel_demo3.png similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/images/tunnel_demo3.png rename to projects/rocprofiler-compute/docs/archive/docs-1.x/images/tunnel_demo3.png diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/index.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/index.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/index.md rename to projects/rocprofiler-compute/docs/archive/docs-1.x/index.md diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/installation.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/installation.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/installation.md rename to projects/rocprofiler-compute/docs/archive/docs-1.x/installation.md diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/introduction.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/introduction.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/introduction.md rename to projects/rocprofiler-compute/docs/archive/docs-1.x/introduction.md diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/make.bat b/projects/rocprofiler-compute/docs/archive/docs-1.x/make.bat similarity index 100% rename from 
projects/rocprofiler-compute/src/archive/docs-1.x/make.bat rename to projects/rocprofiler-compute/docs/archive/docs-1.x/make.bat diff --git a/projects/rocprofiler-compute/src/archive/docs-1.x/profiling.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/profiling.md similarity index 100% rename from projects/rocprofiler-compute/src/archive/docs-1.x/profiling.md rename to projects/rocprofiler-compute/docs/archive/docs-1.x/profiling.md diff --git a/projects/rocprofiler-compute/src/docs/.gitignore b/projects/rocprofiler-compute/docs/archive/docs-2.x/.gitignore similarity index 100% rename from projects/rocprofiler-compute/src/docs/.gitignore rename to projects/rocprofiler-compute/docs/archive/docs-2.x/.gitignore diff --git a/projects/rocprofiler-compute/src/docs/.nojekyll b/projects/rocprofiler-compute/docs/archive/docs-2.x/.nojekyll similarity index 100% rename from projects/rocprofiler-compute/src/docs/.nojekyll rename to projects/rocprofiler-compute/docs/archive/docs-2.x/.nojekyll diff --git a/projects/rocprofiler-compute/src/docs/Makefile b/projects/rocprofiler-compute/docs/archive/docs-2.x/Makefile similarity index 94% rename from projects/rocprofiler-compute/src/docs/Makefile rename to projects/rocprofiler-compute/docs/archive/docs-2.x/Makefile index 41c270bb32..c3854a5224 100644 --- a/projects/rocprofiler-compute/src/docs/Makefile +++ b/projects/rocprofiler-compute/docs/archive/docs-2.x/Makefile @@ -6,7 +6,7 @@ SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . -BUILDDIR = _build +BUILDDIR = ../_build # Put it first so that "make" without argument is like "make help". help: @@ -17,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/projects/rocprofiler-compute/src/docs/README b/projects/rocprofiler-compute/docs/archive/docs-2.x/README similarity index 100% rename from projects/rocprofiler-compute/src/docs/README rename to projects/rocprofiler-compute/docs/archive/docs-2.x/README diff --git a/projects/rocprofiler-compute/docs/archive/docs-2.x/VERSION b/projects/rocprofiler-compute/docs/archive/docs-2.x/VERSION new file mode 100644 index 0000000000..38f77a65b3 --- /dev/null +++ b/projects/rocprofiler-compute/docs/archive/docs-2.x/VERSION @@ -0,0 +1 @@ +2.0.1 diff --git a/projects/rocprofiler-compute/src/docs/_static/css/custom.css b/projects/rocprofiler-compute/docs/archive/docs-2.x/_static/css/custom.css similarity index 100% rename from projects/rocprofiler-compute/src/docs/_static/css/custom.css rename to projects/rocprofiler-compute/docs/archive/docs-2.x/_static/css/custom.css diff --git a/projects/rocprofiler-compute/src/docs/analysis.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/analysis.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/analysis.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/analysis.md diff --git a/projects/rocprofiler-compute/src/docs/conf.py b/projects/rocprofiler-compute/docs/archive/docs-2.x/conf.py similarity index 98% rename from projects/rocprofiler-compute/src/docs/conf.py rename to projects/rocprofiler-compute/docs/archive/docs-2.x/conf.py index 9cedb65a5c..b8ff5a0059 100644 --- a/projects/rocprofiler-compute/src/docs/conf.py +++ b/projects/rocprofiler-compute/docs/archive/docs-2.x/conf.py @@ -20,8 +20,8 @@ sys.path.insert(0, os.path.abspath("..")) repo_version = "unknown" # Determine short version by file in repo -if os.path.isfile("../../VERSION"): - with open("../../VERSION") as f: +if os.path.isfile("./VERSION"): 
+ with open("./VERSION") as f: repo_version = f.readline().strip() diff --git a/projects/rocprofiler-compute/src/docs/faq.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/faq.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/faq.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/faq.md diff --git a/projects/rocprofiler-compute/src/docs/getting_started.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/getting_started.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/getting_started.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/getting_started.md diff --git a/projects/rocprofiler-compute/src/docs/high_level_design.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/high_level_design.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/high_level_design.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/high_level_design.md diff --git a/projects/rocprofiler-compute/src/docs/images/Current_and_baseline_dispatch_ids.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/Current_and_baseline_dispatch_ids.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/Current_and_baseline_dispatch_ids.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/Current_and_baseline_dispatch_ids.png diff --git a/projects/rocprofiler-compute/src/docs/images/Kernel_time_histogram.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/Kernel_time_histogram.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/Kernel_time_histogram.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/Kernel_time_histogram.png diff --git a/projects/rocprofiler-compute/src/docs/images/L1_l2_transactions_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/L1_l2_transactions_per_channel.png similarity index 100% rename from 
projects/rocprofiler-compute/src/docs/images/L1_l2_transactions_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/L1_l2_transactions_per_channel.png diff --git a/projects/rocprofiler-compute/src/docs/images/L2_ea_latencies_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_latencies_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/L2_ea_latencies_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_latencies_per_channel.png diff --git a/projects/rocprofiler-compute/src/docs/images/L2_ea_stalls_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_stalls_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/L2_ea_stalls_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_stalls_per_channel.png diff --git a/projects/rocprofiler-compute/src/docs/images/L2_ea_write_stalls_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_write_stalls_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/L2_ea_write_stalls_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_write_stalls_per_channel.png diff --git a/projects/rocprofiler-compute/src/docs/images/L2_ea_write_starvation_per_channel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_write_starvation_per_channel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/L2_ea_write_starvation_per_channel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/L2_ea_write_starvation_per_channel.png diff --git a/projects/rocprofiler-compute/src/docs/images/Memory_latencies.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/Memory_latencies.png similarity index 100% rename from 
projects/rocprofiler-compute/src/docs/images/Memory_latencies.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/Memory_latencies.png diff --git a/projects/rocprofiler-compute/src/docs/images/Roofline_analysis.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/Roofline_analysis.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/Roofline_analysis.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/Roofline_analysis.png diff --git a/projects/rocprofiler-compute/src/docs/images/Top_bottleneck_dispatches.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/Top_bottleneck_dispatches.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/Top_bottleneck_dispatches.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/Top_bottleneck_dispatches.png diff --git a/projects/rocprofiler-compute/src/docs/images/Top_bottleneck_kernels.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/Top_bottleneck_kernels.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/Top_bottleneck_kernels.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/Top_bottleneck_kernels.png diff --git a/projects/rocprofiler-compute/src/docs/images/amd-header-logo.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/amd-header-logo.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/amd-header-logo.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/amd-header-logo.svg diff --git a/projects/rocprofiler-compute/src/docs/images/cpc_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cpc_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cpc_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cpc_panel.png diff --git 
a/projects/rocprofiler-compute/src/docs/images/cpf_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cpf_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cpf_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cpf_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/cu-arith-ops_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-arith-ops_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cu-arith-ops_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-arith-ops_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/cu-inst-mix_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-inst-mix_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cu-inst-mix_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-inst-mix_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/cu-mafma-arith-instr-mix_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-mafma-arith-instr-mix_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cu-mafma-arith-instr-mix_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-mafma-arith-instr-mix_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/cu-pipeline-stats_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-pipeline-stats_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cu-pipeline-stats_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-pipeline-stats_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/cu-sol_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-sol_panel.png similarity index 100% rename from 
projects/rocprofiler-compute/src/docs/images/cu-sol_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-sol_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/cu-value-arith-instr-mix_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-value-arith-instr-mix_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cu-value-arith-instr-mix_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-value-arith-instr-mix_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/cu-vmem-instr-mix_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-vmem-instr-mix_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/cu-vmem-instr-mix_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/cu-vmem-instr-mix_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/datasource_config.jpg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/datasource_config.jpg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/datasource_config.jpg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/datasource_config.jpg diff --git a/projects/rocprofiler-compute/src/docs/images/datasource_settings.jpg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/datasource_settings.jpg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/datasource_settings.jpg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/datasource_settings.jpg diff --git a/projects/rocprofiler-compute/src/docs/images/fabric.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/fabric.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/fabric.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/fabric.png diff --git 
a/projects/rocprofiler-compute/src/docs/images/fabric.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/fabric.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/fabric.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/fabric.svg diff --git a/projects/rocprofiler-compute/src/docs/images/fig_level_counter.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/fig_level_counter.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/fig_level_counter.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/fig_level_counter.png diff --git a/projects/rocprofiler-compute/src/docs/images/gcn_compute_unit.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/gcn_compute_unit.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/gcn_compute_unit.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/gcn_compute_unit.png diff --git a/projects/rocprofiler-compute/src/docs/images/global_variables.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/global_variables.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/global_variables.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/global_variables.png diff --git a/projects/rocprofiler-compute/src/docs/images/grafana_welcome.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/grafana_welcome.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/grafana_welcome.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/grafana_welcome.png diff --git a/projects/rocprofiler-compute/src/docs/images/grafana_workload_selection.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/grafana_workload_selection.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/grafana_workload_selection.png rename to 
projects/rocprofiler-compute/docs/archive/docs-2.x/images/grafana_workload_selection.png diff --git a/projects/rocprofiler-compute/src/docs/images/import_dashboard.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/import_dashboard.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/import_dashboard.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/import_dashboard.png diff --git a/projects/rocprofiler-compute/src/docs/images/install_decision_tree.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/install_decision_tree.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/install_decision_tree.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/install_decision_tree.png diff --git a/projects/rocprofiler-compute/src/docs/images/instr-cache-accesses_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/instr-cache-accesses_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/instr-cache-accesses_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/instr-cache-accesses_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/instr-cache-sol_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/instr-cache-sol_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/instr-cache-sol_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/instr-cache-sol_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/l1perf_model.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/l1perf_model.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/l1perf_model.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/l1perf_model.png diff --git a/projects/rocprofiler-compute/src/docs/images/l1perf_model.svg 
b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/l1perf_model.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/l1perf_model.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/l1perf_model.svg diff --git a/projects/rocprofiler-compute/src/docs/images/l2-accesses_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-accesses_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/l2-accesses_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-accesses_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/l2-fabric-interface-stalls_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-fabric-interface-stalls_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/l2-fabric-interface-stalls_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-fabric-interface-stalls_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/l2-fabric-transactions_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-fabric-transactions_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/l2-fabric-transactions_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-fabric-transactions_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/l2-per-channel-agg-stats_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-per-channel-agg-stats_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/l2-per-channel-agg-stats_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-per-channel-agg-stats_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/l2-sol_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-sol_panel.png similarity index 100% 
rename from projects/rocprofiler-compute/src/docs/images/l2-sol_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/l2-sol_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/lds-sol_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds-sol_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/lds-sol_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds-sol_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/lds-stats_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds-stats_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/lds-stats_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds-stats_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/lds.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/lds.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds.png diff --git a/projects/rocprofiler-compute/src/docs/images/lds.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/lds.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/lds.svg diff --git a/projects/rocprofiler-compute/src/docs/images/ldsbandwidth.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsbandwidth.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/ldsbandwidth.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsbandwidth.png diff --git a/projects/rocprofiler-compute/src/docs/images/ldsbandwidth.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsbandwidth.svg similarity index 100% rename from 
projects/rocprofiler-compute/src/docs/images/ldsbandwidth.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsbandwidth.svg diff --git a/projects/rocprofiler-compute/src/docs/images/ldsconflictrate.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflictrate.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/ldsconflictrate.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflictrate.png diff --git a/projects/rocprofiler-compute/src/docs/images/ldsconflictrate.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflictrate.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/ldsconflictrate.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflictrate.svg diff --git a/projects/rocprofiler-compute/src/docs/images/ldsconflicts.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflicts.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/ldsconflicts.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflicts.png diff --git a/projects/rocprofiler-compute/src/docs/images/ldsconflicts.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflicts.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/ldsconflicts.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/ldsconflicts.svg diff --git a/projects/rocprofiler-compute/src/docs/images/memory-chart_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/memory-chart_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/memory-chart_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/memory-chart_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/nosplit.png 
b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/nosplit.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/nosplit.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/nosplit.png diff --git a/projects/rocprofiler-compute/src/docs/images/nosplit.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/nosplit.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/nosplit.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/nosplit.svg diff --git a/projects/rocprofiler-compute/src/docs/images/omniperf_architecture.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/omniperf_architecture.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/omniperf_architecture.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/omniperf_architecture.png diff --git a/projects/rocprofiler-compute/src/docs/images/omniperf_server_vs_client_install.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/omniperf_server_vs_client_install.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/omniperf_server_vs_client_install.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/omniperf_server_vs_client_install.png diff --git a/projects/rocprofiler-compute/src/docs/images/opening_dashboard.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/opening_dashboard.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/opening_dashboard.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/opening_dashboard.png diff --git a/projects/rocprofiler-compute/src/docs/images/roofline_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/roofline_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/roofline_panel.png rename to 
projects/rocprofiler-compute/docs/archive/docs-2.x/images/roofline_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/sample-roof-plot.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/sample-roof-plot.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/sample-roof-plot.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/sample-roof-plot.png diff --git a/projects/rocprofiler-compute/src/docs/images/selayout.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/selayout.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/selayout.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/selayout.png diff --git a/projects/rocprofiler-compute/src/docs/images/sl1d-cache-accesses_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/sl1d-cache-accesses_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/sl1d-cache-accesses_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/sl1d-cache-accesses_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/sl1d-l12-interface_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/sl1d-l12-interface_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/sl1d-l12-interface_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/sl1d-l12-interface_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/sl1d-sol_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/sl1d-sol_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/sl1d-sol_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/sl1d-sol_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/sol_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/sol_panel.png 
similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/sol_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/sol_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/spi-resource-allocation_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/spi-resource-allocation_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/spi-resource-allocation_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/spi-resource-allocation_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/spi-stats_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/spi-stats_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/spi-stats_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/spi-stats_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/split.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/split.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/split.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/split.png diff --git a/projects/rocprofiler-compute/src/docs/images/split.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/split.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/split.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/split.svg diff --git a/projects/rocprofiler-compute/src/docs/images/standalone_gui.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/standalone_gui.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/standalone_gui.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/standalone_gui.png diff --git a/projects/rocprofiler-compute/src/docs/images/system-info_panel.png 
b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/system-info_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/system-info_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/system-info_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/ta_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/ta_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/ta_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/ta_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/td_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/td_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/td_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/td_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/top-stat_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/top-stat_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/top-stat_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/top-stat_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/tunnel_demo1.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/tunnel_demo1.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/tunnel_demo1.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/tunnel_demo1.png diff --git a/projects/rocprofiler-compute/src/docs/images/tunnel_demo2.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/tunnel_demo2.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/tunnel_demo2.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/tunnel_demo2.png diff --git a/projects/rocprofiler-compute/src/docs/images/tunnel_demo3.png 
b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/tunnel_demo3.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/tunnel_demo3.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/tunnel_demo3.png diff --git a/projects/rocprofiler-compute/src/docs/images/uncached.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/uncached.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/uncached.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/uncached.png diff --git a/projects/rocprofiler-compute/src/docs/images/uncached.svg b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/uncached.svg similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/uncached.svg rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/uncached.svg diff --git a/projects/rocprofiler-compute/src/docs/images/vl1d-addr-translation_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-addr-translation_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/vl1d-addr-translation_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-addr-translation_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/vl1d-cache-accesses_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-cache-accesses_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/vl1d-cache-accesses_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-cache-accesses_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/vl1d-cache-stalls_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-cache-stalls_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/vl1d-cache-stalls_panel.png rename to 
projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-cache-stalls_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/vl1d-l2-transactions_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-l2-transactions_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/vl1d-l2-transactions_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-l2-transactions_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/vl1d-sol_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-sol_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/vl1d-sol_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/vl1d-sol_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/wavefront-launch-stats_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/wavefront-launch-stats_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/wavefront-launch-stats_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/wavefront-launch-stats_panel.png diff --git a/projects/rocprofiler-compute/src/docs/images/wavefront-runtime-stats_panel.png b/projects/rocprofiler-compute/docs/archive/docs-2.x/images/wavefront-runtime-stats_panel.png similarity index 100% rename from projects/rocprofiler-compute/src/docs/images/wavefront-runtime-stats_panel.png rename to projects/rocprofiler-compute/docs/archive/docs-2.x/images/wavefront-runtime-stats_panel.png diff --git a/projects/rocprofiler-compute/src/docs/index.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/index.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/index.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/index.md diff --git a/projects/rocprofiler-compute/src/docs/installation.md 
b/projects/rocprofiler-compute/docs/archive/docs-2.x/installation.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/installation.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/installation.md diff --git a/projects/rocprofiler-compute/src/docs/introduction.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/introduction.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/introduction.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/introduction.md diff --git a/projects/rocprofiler-compute/src/docs/make.bat b/projects/rocprofiler-compute/docs/archive/docs-2.x/make.bat similarity index 100% rename from projects/rocprofiler-compute/src/docs/make.bat rename to projects/rocprofiler-compute/docs/archive/docs-2.x/make.bat diff --git a/projects/rocprofiler-compute/src/docs/performance_model.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/performance_model.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/performance_model.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/performance_model.md diff --git a/projects/rocprofiler-compute/src/docs/profiling.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/profiling.md similarity index 100% rename from projects/rocprofiler-compute/src/docs/profiling.md rename to projects/rocprofiler-compute/docs/archive/docs-2.x/profiling.md diff --git a/projects/rocprofiler-compute/requirements-doc.txt b/projects/rocprofiler-compute/docs/archive/requirements-doc.txt similarity index 100% rename from projects/rocprofiler-compute/requirements-doc.txt rename to projects/rocprofiler-compute/docs/archive/requirements-doc.txt diff --git a/projects/rocprofiler-compute/docs/conceptual/command-processor.rst b/projects/rocprofiler-compute/docs/conceptual/command-processor.rst new file mode 100644 index 0000000000..a055768a1f --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/command-processor.rst @@ -0,0 
+1,154 @@ +.. meta:: + :description: Omniperf performance model: Command processor (CP) + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, command, processor, fetcher, packet processor, CPF, CPC + +********************** +Command processor (CP) +********************** + +The command processor (CP) is responsible for interacting with the AMDGPU kernel +driver -- the Linux kernel -- on the CPU and for interacting with user-space +HSA clients when they submit commands to HSA queues. Basic tasks of the CP +include reading commands (such as, corresponding to a kernel launch) out of +:hsa-runtime-pdf:`HSA queues <68>`, scheduling work to subsequent parts of the +scheduler pipeline, and marking kernels complete for synchronization events on +the host. + +The command processor consists of two sub-components: + +* :ref:`Fetcher ` (CPF): Fetches commands out of memory to hand + them over to the CPC for processing. + +* :ref:`Packet processor ` (CPC): Micro-controller running the + command processing firmware that decodes the fetched commands and (for + kernels) passes them to the :ref:`workgroup processors ` for + scheduling. + +Before scheduling work to the accelerator, the command processor can +first acquire a memory fence to ensure system consistency +(:hsa-runtime-pdf:`Section 2.6.4 <91>`). After the work is complete, the +command processor can apply a memory-release fence. Depending on the AMD CDNA™ +accelerator under question, either of these operations *might* initiate a cache +write-back or invalidation. + +Analyzing command processor performance is most interesting for kernels +that you suspect to be limited by scheduling or launch rate. The command +processor’s metrics therefore are focused on reporting, for example: + +* Utilization of the fetcher + +* Utilization of the packet processor, and decoding processing packets + +* Stalls in fetching and processing + +.. 
_cpf-metrics: + +Command processor fetcher (CPF) +=============================== + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - CPF Utilization + + - Percent of total cycles where the CPF was busy actively doing any work. + The ratio of CPF busy cycles over total cycles counted by the CPF. + + - Percent + + * - CPF Stall + + - Percent of CPF busy cycles where the CPF was stalled for any reason. + + - Percent + + * - CPF-L2 Utilization + + - Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface + where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 + busy cycles over total cycles counted by the CPF-L2. + + - Percent + + * - CPF-L2 Stall + + - Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 + interface was stalled for any reason. + + - Percent + + * - CPF-UTCL1 Stall + + - Percent of CPF busy cycles where the CPF was stalled by address + translation. + + - Percent + +.. _cpc-metrics: + +Command processor packet processor (CPC) +======================================== + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - CPC Utilization + + - Percent of total cycles where the CPC was busy actively doing any work. + The ratio of CPC busy cycles over total cycles counted by the CPC. + + - Percent + + * - CPC Stall + + - Percent of CPC busy cycles where the CPC was stalled for any reason. + + - Percent + + * - CPC Packet Decoding Utilization + + - Percent of CPC busy cycles spent decoding commands for processing. + + - Percent + + * - CPC-Workgroup Manager Utilization + + - Percent of CPC busy cycles spent dispatching workgroups to the + :ref:`workgroup manager `. + + - Percent + + * - CPC-L2 Utilization + + - Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface + where the CPC-L2 interface was active doing any work. + + - Percent + + * - CPC-UTCL1 Stall + + - Percent of CPC busy cycles where the CPC was stalled by address + translation.
+ + - Percent + + * - CPC-UTCL2 Utilization + + - Percent of total cycles counted by the CPC's :doc:`L2 ` address + translation interface where the CPC was busy doing address translation + work. + + - Percent + diff --git a/projects/rocprofiler-compute/docs/conceptual/compute-unit.rst b/projects/rocprofiler-compute/docs/conceptual/compute-unit.rst new file mode 100644 index 0000000000..e7061c814e --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/compute-unit.rst @@ -0,0 +1,60 @@ +.. meta:: + :description: Omniperf performance model: Compute unit (CU) + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, GCN, compute, unit, pipeline, workgroup, wavefront, + CDNA + +***************** +Compute unit (CU) +***************** + +The compute unit (CU) is responsible for executing a user's kernels on +CDNA™-based accelerators. All :ref:`wavefronts ` of a +:ref:`workgroup ` are scheduled on the same CU. + +.. image:: ../data/performance-model/gcn_compute_unit.png + :align: center + :alt: AMD CDNA accelerator compute unit diagram + :width: 800 + +The CU consists of several independent execution pipelines and functional units. +The :doc:`/conceptual/pipeline-descriptions` section details the various +execution pipelines -- VALU, SALU, LDS, scheduler, and so forth. The metrics +presented by Omniperf for these pipelines are described in +:doc:`pipeline-metrics`. The :doc:`vL1D ` cache and +:doc:`LDS ` are described in their own sections. + +* The :ref:`desc-valu` is composed of multiple SIMD (single + instruction, multiple data) vector processors, vector general purpose + registers (VGPRs) and instruction buffers. The VALU is responsible for + executing much of the computational work on CDNA accelerators, including but + not limited to floating-point operations (FLOPs) and integer operations + (IOPs). + +* The vector memory (VMEM) unit is responsible for issuing loads, stores and + atomic operations that interact with the memory system. 
+ +* The :ref:`desc-salu` is shared by all threads in a + :ref:`wavefront `, and is responsible for executing + instructions that are known to be uniform across the wavefront at compile + time. The SALU has a memory unit (SMEM) for interacting with memory, but it + cannot issue separately from the SALU. + +* The :doc:`local-data-share` is an on-CU software-managed scratchpad memory + that can be used to efficiently share data between all threads in a + :ref:`workgroup `. + +* The :ref:`desc-scheduler` is responsible for issuing and decoding instructions + for all the :ref:`wavefronts ` on the compute unit. + +* The :doc:`vector L1 data cache (vL1D) ` is the first level + cache local to the compute unit. On current CDNA accelerators, the vL1D is + write-through. The vL1D caches from multiple compute units are kept coherent + with one another through software instructions. + +* CDNA accelerators -- that is, AMD Instinct™ MI100 and newer -- contain + specialized matrix-multiplication accelerator pipelines known as the + :ref:`desc-mfma`. + +For a more in-depth description of a compute unit on a CDNA accelerator, see +:hip-training-pdf:`22` and :gcn-crash-course:`27`. + diff --git a/projects/rocprofiler-compute/docs/conceptual/definitions.rst b/projects/rocprofiler-compute/docs/conceptual/definitions.rst new file mode 100644 index 0000000000..8ad483094a --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/definitions.rst @@ -0,0 +1,152 @@ +.. meta:: + :description: Omniperf terminology and definitions + :keywords: Omniperf, ROCm, glossary, definitions, terms, profiler, tool, + Instinct, accelerator, AMD + +*********** +Definitions +*********** + +The following table briefly defines some terminology used in Omniperf interfaces +and in this documentation. + +.. include:: ./includes/terms.rst + +.. include:: ./includes/normalization-units.rst + +.. 
_memory-spaces: + +Memory spaces +============= + +AMD Instinct™ MI-series accelerators can access memory through multiple address spaces +which may map to different physical memory locations on the system. The +following table provides a view into how various types of memory used +in HIP map onto these constructs: + +.. list-table:: + :header-rows: 1 + + * - LLVM Address Space + - Hardware Memory Space + - HIP Terminology + + * - Generic + - Flat + - N/A + + * - Global + - Global + - Global + + * - Local + - LDS + - LDS/Shared + + * - Private + - Scratch + - Private + + * - Constant + - Same as global + - Constant + +The following is a high-level description of the address spaces in the AMDGPU +backend of LLVM: + +.. list-table:: + :header-rows: 1 + + * - Address space + - Description + + * - Global + - Memory that can be seen by all threads in a process, and may be backed by + the local accelerator's HBM, a remote accelerator's HBM, or the CPU's + DRAM. + + * - Local + - Memory that is only visible to a particular workgroup. On AMD's Instinct + accelerator hardware, this is stored in :doc:`LDS <local-data-share>` + memory. + + * - Private + - Memory that is only visible to a particular :ref:`work-item <desc-work-item>` + (thread), stored in the scratch space on AMD's Instinct accelerators. + + * - Constant + - Read-only memory that is in the global address space and stored on the + local accelerator's HBM. + + * - Generic + - Used when the compiler cannot statically prove that a pointer is + addressing memory in a single (non-generic) address space. Mapped to Flat + on AMD's Instinct accelerators, the pointer could dynamically address + global, local, private or constant memory. + +`LLVM's documentation for AMDGPU Backend <https://llvm.org/docs/AMDGPUUsage.html#address-spaces>`_ +has the most up-to-date information. Refer to this source for a more complete +explanation. + +..
_memory-type: + +Memory type +=========== + +AMD Instinct accelerators contain a number of different memory allocation +types to enable the HIP language's +:doc:`memory coherency model `. +These memory types are broadly similar between AMD Instinct accelerator +generations, but may differ in exact implementation. + +In addition, these memory types *might* differ between accelerators on the same +system, even when accessing the same memory allocation. + +For example, an :ref:`MI2XX ` accelerator accessing *fine-grained* +memory allocated local to that device may see the allocation as coherently +cacheable, while a remote accelerator might see the same allocation as +*uncached*. + +These memory types include: + +.. list-table:: + :header-rows: 1 + + * - Memory type + - Description + + * - Uncached Memory (UC) + - Memory that will not be cached in this accelerator. On + :ref:`MI2XX ` accelerators, this corresponds to “fine-grained” + (or, “coherent”) memory allocated on a remote accelerator or the host, + for example, using ``hipHostMalloc`` or ``hipMallocManaged`` with default + allocation flags. + + * - Non-hardware-Coherent Memory (NC) + - Memory that will be cached by the accelerator, and is only guaranteed to + be consistent at kernel boundaries / after software-driven + synchronization events. On :ref:`MI2XX ` accelerators, this + type of memory maps to, for example, “coarse-grained” ``hipHostMalloc``’d + memory -- that is, allocated with the ``hipHostMallocNonCoherent`` + flag -- or ``hipMalloc``’d memory allocated on a remote accelerator. + + * - Coherently Cacheable (CC) + - Memory for which only reads from the accelerator where the memory was + allocated will be cached. Writes to CC memory are uncached, and trigger + invalidations of any line within this accelerator.
On + :ref:`MI2XX ` accelerators, this type of memory maps to + “fine-grained” memory allocated on the local accelerator using, for + example, the ``hipExtMallocWithFlags`` API using the + ``hipDeviceMallocFinegrained`` flag. + + * - Read/Write Coherent Memory (RW) + - Memory that will be cached by the accelerator, but may be invalidated by + writes from remote devices at kernel boundaries / after software-driven + synchronization events. On :ref:`MI2XX ` accelerators, this + corresponds to “coarse-grained” memory allocated locally to the + accelerator, using for example, the default ``hipMalloc`` allocator. + +Find a good discussion of coarse and fine-grained memory allocations and what +type of memory is returned by various combinations of memory allocators, flags +and arguments in the +`Crusher quick-start guide `_. diff --git a/projects/rocprofiler-compute/docs/conceptual/includes/normalization-units.rst b/projects/rocprofiler-compute/docs/conceptual/includes/normalization-units.rst new file mode 100644 index 0000000000..34961f7e0a --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/includes/normalization-units.rst @@ -0,0 +1,47 @@ +.. _normalization-units: + +Normalization units +=================== + +A user-configurable unit by which you can choose to normalize data. Options +include: + +.. list-table:: + :header-rows: 1 + + * - Name + - Description + + * - ``per_wave`` + - The total value of the measured counter or metric that occurred per + kernel invocation divided by the total number of + :ref:`wavefronts ` launched in the kernel. + + * - ``per_cycle`` + - The total value of the measured counter or metric that occurred per + kernel invocation divided by the + :ref:`kernel cycles `, that is, the total number of + cycles the kernel executed as measured by the + :doc:`command processor `. + + * - ``per_kernel`` + - The total value of the measured counter or metric that occurred per + kernel invocation. 
+ + * - ``per_second`` + - The total value of the measured counter or metric that occurred per + kernel invocation divided by the :ref:`kernel time <kernel-time>`, + that is, the total runtime of the kernel in seconds, as measured by the + :doc:`command processor `. + +By default, Omniperf uses the ``per_wave`` normalization. + +.. tip:: + + The best normalization may vary depending on your use case. For instance, a + ``per_second`` normalization might be useful for FLOP or bandwidth + comparisons, while a ``per_wave`` normalization could be useful to see how many + (and what types of) instructions are used per wavefront. A ``per_kernel`` + normalization can be useful to get the total aggregate values of metrics for + comparison between different configurations. + diff --git a/projects/rocprofiler-compute/docs/conceptual/includes/terms.rst b/projects/rocprofiler-compute/docs/conceptual/includes/terms.rst new file mode 100644 index 0000000000..bc0080bdb9 --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/includes/terms.rst @@ -0,0 +1,188 @@ +.. _desc-workgroup: + +.. _desc-work-item: + +.. _desc-wavefront: + +.. _desc-divergence: + +.. _kernel-time: + +.. _kernel-cycles: + +.. _total-active-cu-cycles: + +.. _total-cu-cycles: + +.. _total-se-cycles: + +.. _total-simd-cycles: + +.. _total-pipe-cycles: + +.. _total-l1i-cycles: + +.. _total-active-l2-cycles: + +.. _total-l2-cycles: + +.. _total-sl1d-cycles: + +.. _thread-requests: + +.. list-table:: + :header-rows: 1 + + * - Name + + - Description + + - Unit + + * - Kernel time + + - The number of seconds the accelerator was executing a kernel, from the + :doc:`command processor `'s (CP) start-of-kernel + timestamp (a number of cycles after the CP begins processing the packet) + to the CP's end-of-kernel timestamp (a number of cycles before the CP + stops processing the packet).
+ + - Seconds + + * - Kernel cycles + + - The number of cycles the accelerator was active doing *any* work, as + measured by the :doc:`command processor ` (CP). + + - Cycles + + * - Total CU cycles + + - The number of cycles the accelerator was active doing *any* work + (that is, kernel cycles), multiplied by the number of + :doc:`compute units ` on the accelerator. A + measure of the total possible active cycles the compute units could be + doing work, useful for the normalization of metrics inside the CU. + + - Cycles + + * - Total active CU cycles + + - The number of cycles a CU on the accelerator was active doing *any* + work, summed over all :doc:`compute units ` on the + accelerator. + + - Cycles + + * - Total SIMD cycles + + - The number of cycles the accelerator was active doing *any* work (that + is, kernel cycles), multiplied by the number of + :doc:`SIMDs ` on the accelerator. A measure of the + total possible active cycles the SIMDs could be doing work, useful for + the normalization of metrics inside the CU. + + - Cycles + + * - Total L2 cycles + + - The number of cycles the accelerator was active doing *any* work (that + is, kernel cycles), multiplied by the number of :doc:`L2 ` + channels on the accelerator. A measure of the total possible active + cycles the L2 channels could be doing work, useful for the normalization + of metrics inside the L2. + + - Cycles + + * - Total active L2 cycles + + - The number of cycles a channel of the L2 cache was active doing *any* + work, summed over all :doc:`L2 ` channels on the accelerator. + + - Cycles + + * - Total sL1D cycles + + - The number of cycles the accelerator was active doing *any* work (that + is, kernel cycles), multiplied by the number of + :ref:`scalar L1 data caches ` on the accelerator. A measure of + the total possible active cycles the sL1Ds could be doing work, useful + for the normalization of metrics inside the sL1D. 
+ + - Cycles + + * - Total L1I cycles + + - The number of cycles the accelerator was active doing *any* work (that + is, kernel cycles), multiplied by the number of + :ref:`L1 instruction caches ` (L1I) on the accelerator. A + measure of the total possible active cycles the L1Is could be doing + work, useful for the normalization of metrics inside the L1I. + + - Cycles + + * - Total scheduler-pipe cycles + + - The number of cycles the accelerator was active doing *any* work (that + is, kernel cycles), multiplied by the number of + :doc:`scheduler pipes ` on the accelerator. A measure + of the total possible active cycles the scheduler-pipes could be doing + work, useful for the normalization of metrics inside the + :ref:`workgroup manager ` and + :doc:`command processor `. + + - Cycles + + * - Total shader-engine cycles + + - The total number of cycles the accelerator was active doing *any* work, + multiplied by the number of :doc:`shader engines ` on the + accelerator. A measure of the total possible active cycles the shader + engines could be doing work, useful for the normalization of + metrics inside the :ref:`workgroup manager `. + + - Cycles + + * - Thread-requests + + - The number of unique memory addresses accessed by a single memory + instruction. On AMD Instinct accelerators, this has a maximum of 64 + (that is, the size of the :ref:`wavefront `). + + - Addresses + + * - Work-item + + - A single *thread*, or lane, of execution that executes in lockstep with + the rest of the work-items comprising a :ref:`wavefront ` + of execution. + + - N/A + + * - Wavefront + + - A group of work-items, or threads, that execute in lockstep on the + :doc:`compute unit `. On AMD Instinct accelerators, the + wavefront size is always 64 work-items. + + - N/A + + * - Workgroup + + - A group of wavefronts that execute on the same + :doc:`compute unit `, and can cooperatively execute and + share data via the use of synchronization primitives, + :doc:`LDS `, atomics, and others. 
+ + - N/A + + * - Divergence + + - Divergence within a wavefront occurs when not all work-items are active + when executing an instruction, that is, due to non-uniform control flow + within a wavefront. Can reduce execution efficiency by causing, + for instance, the :ref:`VALU ` to need to execute both + branches of a conditional with different sets of work-items active. + + - N/A + diff --git a/projects/rocprofiler-compute/docs/conceptual/l2-cache.rst b/projects/rocprofiler-compute/docs/conceptual/l2-cache.rst new file mode 100644 index 0000000000..2c4b44514d --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/l2-cache.rst @@ -0,0 +1,776 @@ +.. meta:: + :description: Omniperf performance model: L2 cache (TCC) + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, L2, cache, infinity fabric, metrics + +************** +L2 cache (TCC) +************** + +The L2 cache is the coherence point for current AMD Instinct™ MI-series GCN™ +GPUs and CDNA™ accelerators, and is shared by all :doc:`CUs ` +on the device. Besides serving requests from the +:doc:`vector L1 data caches `, the L2 cache also is responsible +for servicing requests from the :ref:`L1 instruction caches `, the +:ref:`scalar L1 data caches ` and the +:doc:`command processor `. The L2 cache is composed of a +number of distinct channels (32 on MI100 and :ref:`MI2XX ` series CDNA +accelerators at 256B address interleaving) which can largely operate +independently. Mapping of incoming requests to a specific L2 channel is +determined by a hashing mechanism that attempts to evenly distribute requests +across the L2 channels. Requests that miss in the L2 cache are passed out to +:ref:`Infinity Fabric™ ` to be routed to the appropriate memory +location. + +The L2 cache metrics reported by Omniperf are broken down into four +categories: + +* :ref:`L2 Speed-of-Light ` + +* :ref:`L2 cache accesses ` + +* :ref:`L2-Fabric transactions ` + +* :ref:`L2-Fabric stalls ` + +.. 
_l2-sol: + +L2 Speed-of-Light +================= + +.. warning:: + + The theoretical maximum throughput for some metrics in this section + are currently computed with the maximum achievable clock frequency, as + reported by ``rocminfo``, for an accelerator. This may not be realistic for + all workloads. + +The L2 cache’s speed-of-light table contains a few key metrics about the +performance of the L2 cache, aggregated over all the L2 channels, as a +comparison with the peak achievable values of those metrics: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Utilization + + - The ratio of the + :ref:`number of cycles an L2 channel was active, summed over all L2 channels on the accelerator ` + over the :ref:`total L2 cycles `. + + - Percent + + * - Bandwidth + + - The number of bytes looked up in the L2 cache, as a percent of the peak + theoretical bandwidth achievable on the specific accelerator. The number + of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + e.g., if only a single value is requested in a cache line, the data + movement will still be counted as a full cache line. + + - Percent + + * - Hit Rate + + - The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 + cache. + + - Percent + + * - L2-Fabric Read BW + + - The number of bytes read by the L2 over the + :ref:`Infinity Fabric interface ` per unit time. + + - GB/s + + * - L2-Fabric Write and Atomic BW + + - The number of bytes sent by the L2 over the + :ref:`Infinity Fabric interface ` by write and atomic + operations per unit time. + + - GB/s + +.. note:: + + The L2 cache on AMD Instinct MI CDNA accelerators uses a "hit-on-miss" + approach to reporting cache hits. 
That is, if while satisfying a miss, + another request comes in that would hit on the same pending cache line, the + subsequent request will be counted as a 'hit'. Therefore, it is also + important to consider the latency metric in the :ref:`L2-Fabric ` + section when evaluating the L2 hit rate. + +.. _l2-cache-accesses: + +L2 cache accesses +================= + +This section details the incoming requests to the L2 cache from the +:doc:`vL1D ` and other clients -- for instance, the +:ref:`sL1D ` and :ref:`L1I ` caches. + +.. list-table:: + :header-rows: 1 + :widths: 13 70 17 + + * - Metric + + - Description + + - Unit + + * - Bandwidth + + - The number of bytes looked up in the L2 cache, per + :ref:`normalization unit `. The number of bytes is + calculated as the number of cache lines requested multiplied by the cache + line size. This value does not consider partial requests, so for example, + if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + + - Bytes per :ref:`normalization unit `. + + * - Requests + + - The total number of incoming requests to the L2 from all clients for all + request types, per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit `. + + * - Read Requests + + - The total number of read requests to the L2 from all clients. + + - Requests per :ref:`normalization unit ` + + * - Write Requests + + - The total number of write requests to the L2 from all clients. + + - Requests per :ref:`normalization unit ` + + * - Atomic Requests + + - The total number of atomic requests (with and without return) to the L2 + from all clients. + + - Requests per :ref:`normalization unit ` + + * - Streaming Requests + + - The total number of incoming requests to the L2 that are marked as + *streaming*. The exact meaning of this may differ depending on the + targeted accelerator, however on an :ref:`MI2XX ` this + corresponds to + `non-temporal load or stores `_. 
+ The L2 cache attempts to evict *streaming* requests before normal + requests when the L2 is at capacity. + + - Requests per :ref:`normalization unit ` + + * - Probe Requests + + - The number of coherence probe requests made to the L2 cache from outside + the accelerator. On an :ref:`MI2XX `, probe requests may be + generated by, for example, writes to + :ref:`fine-grained device ` memory or by writes to + :ref:`coarse-grained ` device memory. + + - Requests per :ref:`normalization unit ` + + * - Hit Rate + + - The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 + cache. + + - Percent + + * - Hits + + - The total number of requests to the L2 from all clients that hit in the + cache. As noted in the :ref:`Speed-of-Light ` section, this + includes hit-on-miss requests. + + - Requests per :ref:`normalization unit ` + + * - Misses + + - The total number of requests to the L2 from all clients that miss in the + cache. As noted in the :ref:`Speed-of-Light ` section, these do + not include hit-on-miss requests. + + - Requests per :ref:`normalization unit ` + + * - Writebacks + + - The total number of L2 cache lines written back to memory for any reason. + Write-backs may occur due to user code (such as HIP kernel calls to + ``__threadfence_system`` or atomic built-ins) by the + :doc:`command processor `'s memory acquire/release + fences, or for other internal hardware reasons. + + - Cache lines per :ref:`normalization unit ` + + * - Writebacks (Internal) + + - The total number of L2 cache lines written back to memory for internal + hardware reasons, per :ref:`normalization unit `. + + - Cache lines per :ref:`normalization unit `. + + * - Writebacks (vL1D Req) + + - The total number of L2 cache lines written back to memory due to requests + initiated by the :doc:`vL1D cache `, per + :ref:`normalization unit `. + + - Cache lines per :ref:`normalization unit `. 
+ + * - Evictions (Normal) + + - The total number of L2 cache lines evicted from the cache due to capacity + limits, per :ref:`normalization unit `. + + - Cache lines per :ref:`normalization unit `. + + * - Evictions (vL1D Req) + + - The total number of L2 cache lines evicted from the cache due to + invalidation requests initiated by the + :doc:`vL1D cache `, per + :ref:`normalization unit `. + + - Cache lines per :ref:`normalization unit `. + + * - Non-hardware-Coherent Requests + + - The total number of requests to the L2 to Not-hardware-Coherent (NC) + memory allocations, per :ref:`normalization unit `. + See the :ref:`memory-type` for more information. + + - Requests per :ref:`normalization unit `. + + * - Uncached Requests + + - The total number of requests to the L2 that go to Uncached (UC) memory + allocations. See the :ref:`memory-type` for more information. + + - Requests per :ref:`normalization unit `. + + * - Coherently Cached Requests + + - The total number of requests to the L2 that go to Coherently Cacheable (CC) + memory allocations. See the :ref:`memory-type` for more information. + + - Requests per :ref:`normalization unit `. + + * - Read/Write Coherent Requests + + - The total number of requests to the L2 that go to Read-Write coherent memory + (RW) allocations. See the :ref:`memory-type` for more information. + + - Requests per :ref:`normalization unit `. + +.. note:: + + All requests to the L2 are for a single cache line's worth of data. The size + of a cache line may vary depending on the accelerator, however on an AMD + Instinct CDNA2 :ref:`MI2XX ` accelerator, it is 128B, while on + an MI100, it is 64B. + +.. _l2-fabric: + +L2-Fabric transactions +====================== + +Requests/data that miss in the L2 must be routed to memory in order to +service them. The backing memory for a request may be local to this +accelerator (i.e., in the local high-bandwidth memory), in a remote +accelerator’s memory, or even in the CPU’s memory. 
Infinity Fabric +is responsible for routing these memory requests/data to the correct +location and returning any fetched data to the L2 cache. The +:ref:`l2-request-flow` describes the flow of these requests through +Infinity Fabric in more detail, as described by Omniperf metrics, +while :ref:`l2-request-metrics` gives detailed definitions of +individual metrics. + +.. _l2-request-flow: + +Request flow +------------ + +The following is a diagram that illustrates how L2↔Fabric requests are reported +by Omniperf: + +.. figure:: ../data/performance-model/fabric.png + :align: center + :alt: L2-Fabric transaction flow on AMD Instinct MI-series accelerators + :width: 800 + + L2↔Fabric transaction flow on AMD Instinct MI-series accelerators. + + +Requests from the L2 Cache are broken down into two major categories, read +requests and write requests (at this granularity, atomic requests are treated +as writes). + +From there, these requests can additionally be subdivided in a number of ways. +First, these requests may be sent across Infinity Fabric as different +transaction sizes, 32B or 64B on current CDNA accelerators. + +.. note:: + + On current CDNA accelerators, the 32B read request path is expected to be + unused and so is disconnected in the flow diagram. + +In addition, the read and write requests can be further categorized as: + +* Uncached read/write requests, for instance: for access to + :ref:`fine-grained memory ` + +* Atomic requests, for instance: for atomic updates to + :ref:`fine-grained memory ` + +* HBM read/write requests OR remote read/write requests, for instance: for + requests to the accelerator’s local HBM OR requests to a remote accelerator’s + HBM or the CPU’s DRAM + +These classifications are not necessarily *exclusive*. For example, a +write request can be classified as an atomic request to the +accelerator’s local HBM, and an uncached write request. 
The request-flow +diagram marks *exclusive* classifications as a splitting of the flow, +while *non-exclusive* requests do not split the flow line. For example, +a request is either a 32B Write Request OR a 64B Write request, as the +flow splits at this point: + +.. figure:: ../data/performance-model/split.* + :align: center + :alt: Splitting request flow + :width: 800 + + Splitting request flow + +However, continuing along, the same request might be an atomic request and an +uncached write request, as reflected by a non-split flow: + +.. figure:: ../data/performance-model/nosplit.* + :align: center + :alt: Non-splitting request flow + :width: 800 + + Non-splitting request flow + +Finally, we note that :ref:`uncached ` read requests (e.g., to +:ref:`fine-grained memory `) are handled specially on CDNA +accelerators, as indicated in the request flow diagram. These are +expected to be counted as a 64B Read Request, and *if* they are requests +to uncached memory (denoted by the dashed line), they will also be +counted as *two* uncached read requests (that is, the request is split): + +.. figure:: ../data/performance-model/uncached.* + :align: center + :alt: Uncached read-request splitting + :width: 800 + + Uncached read-request splitting. + +.. _l2-request-metrics: + +Metrics +------- + + The following metrics are reported for the L2-Fabric interface: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - L2-Fabric Read Bandwidth + + - The total number of bytes read by the L2 cache from Infinity Fabric per + :ref:`normalization unit `. + + - Bytes per :ref:`normalization unit `. + + * - HBM Read Traffic + + - The percent of read requests generated by the L2 cache that are routed to + the accelerator's local high-bandwidth memory (HBM). 
This breakdown does + not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric Read bandwidth directed to the local HBM. + + - Percent + + * - Remote Read Traffic + + - The percent of read requests generated by the L2 cache that are routed to + any memory location other than the accelerator's local high-bandwidth + memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's + HBM. This breakdown does not consider the *size* of the request (meaning + that 32B and 64B requests are both counted as a single request), so this + metric only *approximates* the percent of the L2-Fabric Read bandwidth + directed to a remote location. + + - Percent + + * - Uncached Read Traffic + + - The percent of read requests generated by the L2 cache that are reading + from an :ref:`uncached memory allocation `. Note, as + described in the :ref:`request flow ` section, a single + 64B read request is typically counted as two uncached read requests. So, + it is possible for the Uncached Read Traffic to reach up to 200% of the + total number of read requests. This breakdown does not consider the + *size* of the request (i.e., 32B and 64B requests are both counted as a + single request), so this metric only *approximates* the percent of the + L2-Fabric read bandwidth directed to an uncached memory location. + + - Percent + + * - L2-Fabric Write and Atomic Bandwidth + + - The total number of bytes written by the L2 over Infinity Fabric by write + and atomic operations per + :ref:`normalization unit `. Note that on current + CDNA accelerators, such as the :ref:`MI2XX `, requests are + only considered *atomic* by Infinity Fabric if they are targeted at + non-write-cacheable memory, for example, + :ref:`fine-grained memory ` allocations or + :ref:`uncached memory ` allocations on the + MI2XX. + + - Bytes per :ref:`normalization unit `. 
+ + * - HBM Write and Atomic Traffic + + - The percent of write and atomic requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). This + breakdown does not consider the *size* of the request (meaning that 32B + and 64B requests are both counted as a single request), so this metric + only *approximates* the percent of the L2-Fabric Write and Atomic + bandwidth directed to the local HBM. Note that on current CDNA + accelerators, such as the :ref:`MI2XX `, requests are only + considered *atomic* by Infinity Fabric if they are targeted at + :ref:`fine-grained memory ` allocations or + :ref:`uncached memory ` allocations. + + - Percent + + * - Remote Write and Atomic Traffic + + - The percent of write and atomic requests generated by the L2 cache that are routed to + any memory location other than the accelerator's local high-bandwidth + memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's + HBM. This breakdown does not consider the *size* of the request (meaning + that 32B and 64B requests are both counted as a single request), so this + metric only *approximates* the percent of the L2-Fabric Write and Atomic bandwidth + directed to a remote location. Note that on current CDNA + accelerators, such as the :ref:`MI2XX `, requests are only + considered *atomic* by Infinity Fabric if they are targeted at + :ref:`fine-grained memory ` allocations or + :ref:`uncached memory ` allocations. + + - Percent + + * - Atomic Traffic + + - The percent of write requests generated by the L2 cache that are atomic + requests to *any* memory location. This breakdown does not consider the + *size* of the request (meaning that 32B and 64B requests are both counted + as a single request), so this metric only *approximates* the percent of + the L2-Fabric Write and Atomic bandwidth used by atomic requests. 
Note that on + current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are + targeted at :ref:`fine-grained memory ` allocations or + :ref:`uncached memory ` allocations. + + - Percent + + * - Uncached Write and Atomic Traffic + + - The percent of write and atomic requests generated by the L2 cache that + are targeting :ref:`uncached memory allocations `. This + breakdown does not consider the *size* of the request (meaning that 32B + and 64B requests are both counted as a single request), so this metric + only *approximates* the percent of the L2-Fabric Write and Atomic bandwidth directed + to uncached memory allocations. + + - Percent + + * - Read Latency + + - The time-averaged number of cycles read requests spent in Infinity Fabric + before data was returned to the L2. + + - Cycles + + * - Write Latency + + - The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + + - Cycles + + * - Atomic Latency + + - The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) + or data (atomic with return value) was returned to the L2. + + - Cycles + + * - Read Stall + + - The ratio of the total number of cycles the L2-Fabric interface was + stalled on a read request to any destination (local HBM, remote PCIe® + connected accelerator or CPU, or remote Infinity Fabric connected + accelerator [#inf]_ or CPU) over the + :ref:`total active L2 cycles `. + + - Percent + + * - Write Stall + + - The ratio of the total number of cycles the L2-Fabric interface was + stalled on a write or atomic request to any destination (local HBM, + remote accelerator or CPU, PCIe connected accelerator or CPU, or remote + Infinity Fabric connected accelerator [#inf]_ or CPU) over the + :ref:`total active L2 cycles `. + + - Percent + +.. 
_l2-detailed-metrics: + +Detailed transaction metrics +---------------------------- + +The following metrics are available in the detailed L2-Fabric +transaction breakdown table: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - 32B Read Requests + + - The total number of L2 requests to Infinity Fabric to read 32B of data + from any memory location, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. Typically unused on CDNA + accelerators. + + - Requests per :ref:`normalization unit `. + + * - Uncached Read Requests + + - The total number of L2 requests to Infinity Fabric to read + :ref:`uncached data ` from any memory location, per + :ref:`normalization unit `. 64B requests for + uncached data are counted as two 32B uncached data requests. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - 64B Read Requests + + - The total number of L2 requests to Infinity Fabric to read 64B of data + from any memory location, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - HBM Read Requests + + - The total number of L2 requests to Infinity Fabric to read 32B or 64B of + data from the accelerator's local HBM, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - Remote Read Requests + + - The total number of L2 requests to Infinity Fabric to read 32B or 64B of + data from any source other than the accelerator's local HBM, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - 32B Write and Atomic Requests + + - The total number of L2 requests to Infinity Fabric to write or atomically + update 32B of data to any memory location, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. 
+ + - Requests per :ref:`normalization unit `. + + * - Uncached Write and Atomic Requests + + - The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of :ref:`uncached data `, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - 64B Write and Atomic Requests + + - The total number of L2 requests to Infinity Fabric to write or atomically + update 64B of data in any memory location, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - HBM Write and Atomic Requests + + - The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator's local HBM, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - Remote Write and Atomic Requests + + - The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in any memory location other than the + accelerator's local HBM, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. + + - Requests per :ref:`normalization unit `. + + * - Atomic Requests + + - The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per + :ref:`normalization unit `. See + :ref:`l2-request-flow` for more detail. Note that on current CDNA + accelerators, such as the :ref:`MI2XX `, requests are only + considered *atomic* by Infinity Fabric if they are targeted at + non-write-cacheable memory, such as + :ref:`fine-grained memory ` allocations or + :ref:`uncached memory ` allocations on the MI2XX. + + - Requests per :ref:`normalization unit `. + +.. 
_l2-fabric-stalls: + +L2-Fabric interface stalls +========================== + +When the interface between the L2 cache and Infinity Fabric becomes backed up by +requests, it may stall, preventing the L2 from issuing additional requests to +Infinity Fabric until prior requests complete. This section gives a breakdown of +what types of requests in a kernel caused a stall (like read versus write), and +to which locations -- for instance, to the accelerator’s local memory, or to +remote accelerators or CPUs. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Read - PCIe Stall + + - The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the + :ref:`total active L2 cycles `. + + - Percent + + * - Read - Infinity Fabric Stall + + - The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a + percent of the :ref:`total active L2 cycles `. + + - Percent + + * - Read - HBM Stall + + - The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the + :ref:`total active L2 cycles `. + + - Percent + + * - Write - PCIe Stall + + - The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to remote PCIe connected accelerators [#inf]_ or CPUs as + a percent of the :ref:`total active L2 cycles `. + + - Percent + + * - Write - Infinity Fabric Stall + + - The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to remote Infinity Fabric connected accelerators [#inf]_ + or CPUs as a percent of the + :ref:`total active L2 cycles `. + + - Percent + + * - Write - HBM Stall + + - The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to accelerator's local HBM as a percent of the + :ref:`total active L2 cycles `. 
+ + - Percent + + * - Write - Credit Starvation + + - The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to any memory location because too many write/atomic + requests were currently in flight, as a percent of the + :ref:`total active L2 cycles `. + + - Percent + +.. warning:: + + On current CDNA accelerators and GCN GPUs, these L2↔Fabric stalls can be undercounted in some circumstances. + +.. rubric:: Footnotes + +.. [#inf] In addition to being used for on-accelerator data-traffic, AMD + `Infinity Fabric `_ + technology can be used to connect multiple accelerators to achieve advanced + peer-to-peer connectivity and enhanced bandwidths over traditional PCIe + connections. Some AMD Instinct MI-series accelerators like the MI250X + `feature coherent CPU↔accelerator connections built using AMD Infinity Fabric `_. + +.. rubric:: Disclaimer + +PCIe® is a registered trademark of PCI-SIG Corporation. + diff --git a/projects/rocprofiler-compute/docs/conceptual/local-data-share.rst b/projects/rocprofiler-compute/docs/conceptual/local-data-share.rst new file mode 100644 index 0000000000..c596844dce --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/local-data-share.rst @@ -0,0 +1,183 @@ +.. meta:: + :description: Omniperf performance model: Local data share (LDS) + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, local, data, share, LDS + +********************** +Local data share (LDS) +********************** + +.. _lds-sol: + +LDS Speed-of-Light +================== + +.. warning:: + + The theoretical maximum throughput for some metrics in this section are + currently computed with the maximum achievable clock frequency, as reported + by ``rocminfo``, for an accelerator. This may not be realistic for all + workloads. + +The :ref:`LDS ` speed-of-light chart shows a number of key metrics for +the LDS as a comparison with the peak achievable values of those metrics. + +.. 
list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Utilization + + - Indicates what percent of the kernel's duration the :ref:`LDS ` + was actively executing instructions (including, but not limited to, load, + store, atomic and HIP's ``__shfl`` operations). Calculated as the ratio + of the total number of cycles LDS was active over the + :ref:`total CU cycles `. + + - Percent + + * - Access Rate + + - Indicates the percentage of SIMDs in the :ref:`VALU ` [#lds-workload]_ + actively issuing LDS instructions, averaged over the lifetime of the + kernel. Calculated as the ratio of the total number of cycles spent by + the :ref:`scheduler ` issuing :ref:`LDS ` + instructions over the + :ref:`total CU cycles `. + + - Percent + + * - Theoretical Bandwidth (% of Peak) + + - Indicates the maximum amount of bytes that *could* have been loaded from, + stored to, or atomically updated in the LDS in this kernel, as a percent + of the peak LDS bandwidth achievable. See the + :ref:`LDS bandwidth example ` for more detail. + + - Percent + + * - Bank Conflict Rate + + - Indicates the percentage of active LDS cycles that were spent servicing + bank conflicts. Calculated as the ratio of LDS cycles spent servicing + bank conflicts over the number of LDS cycles that would have been + required to move the same amount of data in an uncontended access. [#lds-bank-conflict]_ + + - Percent + +.. rubric:: Footnotes + +.. [#lds-workload] Here we assume the typical case where the workload evenly distributes + LDS operations over all SIMDs in a CU (that is, waves on different SIMDs are + executing similar code). For highly unbalanced workloads, where e.g., one + SIMD pair in the CU does not issue LDS instructions at all, this metric is + better interpreted as the percentage of SIMDs issuing LDS instructions on + :ref:`SIMD pairs ` that are actively using the LDS, averaged over + the lifetime of the kernel. + +.. 
[#lds-bank-conflict] The maximum value of the bank conflict rate is less than 100% + (specifically: 96.875%), as the first cycle in the + :ref:`LDS scheduler ` is never considered contended. + +.. _lds-stats: + +Statistics +========== + +The LDS statistics panel gives a more detailed view of the hardware: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - LDS Instructions + + - The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's ``__shfl`` instructions) executed per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Theoretical Bandwidth + + - Indicates the maximum amount of bytes that could have been loaded from, + stored to, or atomically updated in the LDS per + :ref:`normalization unit `. Does *not* take into + account the execution mask of the wavefront when the instruction was + executed. See the + :ref:`LDS bandwidth example ` for more detail. + + - Bytes per :ref:`normalization unit ` + + * - LDS Latency + + - The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + + - Cycles + + * - Bank Conflicts/Access + + - The ratio of the number of cycles spent in the + :ref:`LDS scheduler ` due to bank conflicts (as determined by + the conflict resolution hardware) to the base number of cycles that would + be spent in the LDS scheduler in a completely uncontended case. This is + the unnormalized form of the Bank Conflict Rate. + + - Conflicts/Access + + * - Index Accesses + + - The total number of cycles spent in the :ref:`LDS scheduler ` + over all operations per :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + + * - Atomic Return Cycles + + - The total number of cycles spent on LDS atomics with return per + :ref:`normalization unit `. 
+ + - Cycles per :ref:`normalization unit ` + + * - Bank Conflicts + + - The total number of cycles spent in the :ref:`LDS scheduler ` + due to bank conflicts (as determined by the conflict resolution hardware) + per :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + + * - Address Conflicts + + - The total number of cycles spent in the :ref:`LDS scheduler ` + due to address conflicts (as determined by the conflict resolution + hardware) per :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + + * - Unaligned Stall + + - The total number of cycles spent in the :ref:`LDS scheduler ` + due to stalls from non-dword aligned addresses per + :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + + * - Memory Violations + + - The total number of out-of-bounds accesses made to the LDS, per + :ref:`normalization unit `. This is unused and + expected to be zero in most configurations for modern CDNA™ accelerators. + + - Accesses per :ref:`normalization unit ` + diff --git a/projects/rocprofiler-compute/docs/conceptual/performance-model.rst b/projects/rocprofiler-compute/docs/conceptual/performance-model.rst new file mode 100644 index 0000000000..1a94b3ed69 --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/performance-model.rst @@ -0,0 +1,49 @@ +.. meta:: + :description: Omniperf performance model + :keywords: Omniperf, ROCm, performance, model, profiler, tool, Instinct, + accelerator, AMD + +***************** +Performance model +***************** + +Omniperf makes available an extensive list of metrics to better understand +achieved application performance on AMD Instinct™ MI-series accelerators +including Graphics Core Next™ (GCN) GPUs like the AMD Instinct MI50, CDNA™ +accelerators like the MI100, and CDNA2 accelerators such as the MI250X, MI250, +and MI210. + +To best use profiling data, it's important to understand the role of various +hardware blocks of AMD Instinct accelerators. 
This section describes each +hardware block on the accelerator as interacted with by a software developer to +give a deeper understanding of the metrics reported by profiling data. Refer to +:doc:`/tutorial/profiling-by-example` for more practical examples and details on how +to use Omniperf to optimize your code. + +.. _mixxx-note: + +.. note:: + + In this chapter, **MI2XX** refers to any of the CDNA2 architecture-based AMD + Instinct MI250X, MI250, and MI210 accelerators interchangeably in cases + where the exact product at hand is not relevant. + + For a comparison of AMD Instinct accelerator specifications, refer to + :doc:`Hardware specifications `. For product + details, see the :prod-page:`MI250X `, + :prod-page:`MI250 `, and :prod-page:`MI210 ` + product pages. + +In this chapter, the AMD Instinct performance model used by Omniperf is divided into a handful of +key hardware blocks, each detailed in the following sections: + +* :doc:`compute-unit` + +* :doc:`l2-cache` + +* :doc:`shader-engine` + +* :doc:`command-processor` + +* :doc:`system-speed-of-light` + diff --git a/projects/rocprofiler-compute/docs/conceptual/pipeline-descriptions.rst b/projects/rocprofiler-compute/docs/conceptual/pipeline-descriptions.rst new file mode 100644 index 0000000000..9261421eb6 --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/pipeline-descriptions.rst @@ -0,0 +1,299 @@ +.. meta:: + :description: Omniperf performance model: Pipeline descriptions + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, pipeline, VALU, SALU, VMEM, SMEM, LDS, branch, + scheduler, MFMA, AGPRs + +********************* +Pipeline descriptions +********************* + +This section details the various execution pipelines of the +:doc:`compute unit `. + +.. _desc-valu: + +.. 
_desc-vmem: + +Vector arithmetic logic unit (VALU) +----------------------------------- + +The vector arithmetic logic unit (VALU) executes vector instructions +over an entire wavefront, each :ref:`work-item ` (or, +vector-lane) potentially operating on distinct data. The VALU of a CDNA™ +accelerator or GCN™ GPU typically consists of: + +* Four 16-wide SIMD processors (see :hip-training-pdf:`24` for more details). + +* Four 64 or 128 KiB VGPR files (yielding a total of 256-512 KiB total + per CU), see :ref:`AGPRs ` for more detail. + +* An instruction buffer (per-SIMD) that contains execution slots for up + to 8 wavefronts (for 32 total wavefront slots on each CU). + +* A vector memory (VMEM) unit which transfers data between VGPRs and + memory; each work-item supplies its own memory address and supplies + or receives unique data. + +* CDNA accelerators, such as the MI100 and :ref:`MI2XX `, contain + additional + :amd-lab-note:`Matrix Fused Multiply-Add (MFMA) ` + units. + +To support branching and conditionals, each wavefront in the VALU +has a distinct execution mask which determines which work-items in the +wavefront are active for the currently executing instruction. When +executing a VALU instruction, inactive work-items (according to the +current execution mask of the wavefront) do not execute the instruction +and are treated as no-ops. + +.. note:: + + On GCN GPUs and the CDNA MI100 accelerator, there are slots for up to 10 + wavefronts in the instruction buffer, but generally occupancy is limited by + other factors to 32 waves per :doc:`compute unit `. + On the CDNA2 :ref:`MI2XX ` series accelerators, there are only 8 + waveslots per-SIMD. + +.. _desc-salu: + +.. _desc-smem: + +Scalar arithmetic logic unit (SALU) +----------------------------------- + +The scalar arithmetic logic unit (SALU) executes instructions that are +shared between all work-items in a wavefront. 
This includes control flow +(such as if/else conditionals, branches, and looping), pointer arithmetic, loading +common values, and more. + +The SALU consists of: + +* A scalar processor capable of various arithmetic, conditional, and + comparison (etc.) operations. See + :mi200-isa-pdf:`Chapter 5. Scalar ALU Operations <35>` + of the CDNA2 Instruction Set Architecture (ISA) Reference Guide for more + detail. + +* A 12.5 KiB Scalar General Purpose Register (SGPR) file + +* A scalar memory (SMEM) unit which transfers data between SGPRs and + memory + +Data loaded by the SMEM can be cached in the :ref:`scalar L1 data cache `, +and is typically only used for read-only, uniform accesses such as kernel +arguments, or HIP’s ``__constant__`` memory. + +.. _desc-lds: + +Local data share (LDS) +---------------------- + +The local data share (LDS, a.k.a., "shared memory") is a fast on-CU scratchpad +that can be explicitly managed by software to effectively share data and to +coordinate between wavefronts in a workgroup. + +.. figure:: ../data/performance-model/lds.* + :align: center + :alt: Performance model of the local data share (LDS) on AMD Instinct + accelerators + :width: 800 + + Performance model of the local data share (LDS) on AMD Instinct MI-series + accelerators. + +Above is Omniperf's performance model of the LDS on CDNA accelerators (adapted +from :mantor-gcn-pdf:`20`). The SIMDs in the :ref:`VALU ` are +connected to the LDS in pairs (see above). Only one SIMD per pair may issue an +LDS instruction at a time, but both pairs may issue concurrently. + +On CDNA accelerators, the LDS contains 32 banks and each bank is 4B wide. +The LDS is designed such that each bank can be read from, written to, or +atomically updated every cycle, for a total throughput of 128B/clock +(:gcn-crash-course:`40`). + +On each of the two ports to the SIMDs, 64B can be sent in each direction per +cycle. 
So, a single wavefront, coming from one of the 2 SIMDs in a pair, can +only get back 64B/cycle (16 lanes per cycle). The input port is shared between +data and address and this can affect achieved bandwidth for different data +sizes. For example, a 64-wide store where each lane is sending a 4B value takes +8 cycles (50% peak bandwidth) while a 64-wide store where each lane is sending +a 16B value takes 20 cycles (80% peak bandwidth). + +In addition, the LDS contains conflict-resolution hardware to detect and handle +bank conflicts. A bank conflict occurs when two (or more) +:ref:`work-items ` in a :ref:`wavefront ` want +to read, write, or atomically update different addresses that map to the same +bank in the same cycle. In this case, the conflict detection hardware will +determine a new schedule such that the access is split into multiple cycles with +no conflicts in any single cycle. + +When multiple work-items want to read from the same address within a bank, the +result can be efficiently broadcast (:gcn-crash-course:`41`). Multiple +work-items writing to the same address within a bank typically results in undefined +behavior in HIP and other high-level languages, as the LDS will write the value from the +last work-item as determined by the hardware scheduler (:gcn-crash-course:`41`). +This behavior may be useful in the very specific case of storing a uniform +value. + +Relatedly, an address conflict is defined as occurring when two (or more) +work-items in a wavefront want to atomically update the same address on the same +cycle. As with a bank conflict, this may cause additional cycles of work for the +LDS operation to complete. + +.. _desc-branch: + +Branch +------ + +The branch unit is responsible for executing jumps and branches to execute +control flow operations. +Note that Branch operations are not used for execution mask updates, but only +for “whole wavefront” control-flow changes. + +.. 
_desc-scheduler: + +Scheduler +--------- + +The scheduler is responsible for arbitration and issue of instructions for all +the wavefronts currently executing on the :doc:`CU `. On every +clock cycle, the scheduler: + +* Considers waves from one of the SIMD units for execution, selected in a + round-robin fashion between the SIMDs in the compute unit + +* Issues up to one instruction per wavefront on the selected SIMD + +* Issues up to one instruction per each of the instruction categories among the waves on the selected SIMD: + + * :ref:`VALU ` + + * :ref:`VMEM ` operations + + * :ref:`SALU ` / SMEM operations + + * :ref:`LDS ` + + * :ref:`Branch ` operations + +This gives a maximum of five issued Instructions Per Cycle (IPC), per-SIMD, +per-CU (:hip-training-pdf:`Introduction to AMD GPU Programming with HIP <>`, +:gcn-crash-course:`The AMD GCN Architecture - A Crash Course <>`). On CDNA +accelerators with :ref:`MFMA ` instructions, these are issued via the +:ref:`VALU `. Some of them will execute on a separate functional unit +and typically allow other :ref:`VALU ` operations to execute in their +shadow (see the :ref:`MFMA ` section for more detail). + +.. note:: + + The IPC model used by Omniperf omits the following two complications for + clarity. First, CDNA accelerators contain other execution units on the CU + that are unused for compute applications. Second, so-called "internal" + instructions (see :gcn-crash-course:`29`) are not issued to a functional + unit, and can technically cause the maximum IPC to *exceed* 5 instructions + per-cycle in special (largely unrealistic) cases. The latter issue is + discussed in more detail in the + :ref:`'internal' IPC ` example. + +.. _desc-mfma: + +Matrix fused multiply-add (MFMA) +-------------------------------- + +CDNA accelerators, such as the MI100 and :ref:`MI2XX `, contain +specialized hardware to accelerate matrix-matrix multiplications, also +known as Matrix Fused Multiply-Add (MFMA) operations. 
The exact +operation types and supported formats may vary by accelerator. Refer to the +:amd-lab-note:`AMD matrix cores ` +blog post on GPUOpen for a general discussion of these hardware units. +In addition, to explore the available MFMA instructions in depth on +various AMD accelerators (including the CDNA line), we recommend the +`AMD Matrix Instruction Calculator `_: + +.. code-block:: shell + :caption: Partial snapshot of the AMD Matrix Instruction Calculator Tool + + $ ./matrix_calculator.py --architecture cdna2 --instruction v_mfma_f32_4x4x1f32 --detail-instruction + Architecture: CDNA2 + Instruction: V_MFMA_F32_4X4X1F32 + Encoding: VOP3P-MAI + VOP3P Opcode: 0x42 + VOP3P-MAI Opcode: 0x2 + Matrix Dimensions: + M: 4 + N: 4 + K: 1 + blocks: 16 + Execution statistics: + FLOPs: 512 + Execution cycles: 8 + FLOPs/CU/cycle: 256 + Can co-execute with VALU: True + VALU co-execution cycles possible: 4 + Register usage: + GPRs required for A: 1 + GPRs required for B: 1 + GPRs required for C: 4 + GPRs required for D: 4 + GPR alignment requirement: 8 bytes + +For the purposes of Omniperf, the MFMA unit is typically treated as a separate +pipeline from the :ref:`VALU `, as other VALU instructions (along +with other execution pipelines such as the :ref:`SALU `) typically can be +issued during a portion of the total duration of an MFMA operation. + +.. note:: + + The exact details of VALU and MFMA operation co-execution vary by + instruction, and can be explored in more detail via the following fields in + the + `AMD Matrix Instruction Calculator's detailed instruction information `_: + + * ``Can co-execute with VALU`` + + * ``VALU co-execution cycles possible`` + + +Non-pipeline resources +---------------------- + +In this section, we describe a few resources that are not standalone +pipelines but are important for understanding performance optimization +on CDNA accelerators. + +..
_desc-barrier: + +Barrier +^^^^^^^ + +Barriers are resources on the compute-unit of a CDNA accelerator that +are used to implement synchronization primitives (for example, HIP’s +``__syncthreads``). Barriers are allocated to any workgroup that +consists of more than a single wavefront. + +.. _desc-agprs: + +Accumulation vector general-purpose registers (AGPRs) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Accumulation vector general-purpose registers, or AGPRs, are special +resources that are accessible to a subset of instructions focused on +:ref:`MFMA ` operations. These registers allow the MFMA +unit to access more than the normal maximum of 256 architected +:ref:`vector general-purpose registers (VGPRs) ` by having up to 256 +in the architected space and up to 256 in the accumulation space. +Traditional VALU instructions can only use VGPRs in the architected +space, and data can be moved to/from VGPRs↔AGPRs using specialized +instructions (``v_accvgpr_*``). These data movement instructions may be +used by the compiler to implement lower-cost register-spill/fills on +architectures with AGPRs. + +AGPRs are not available on all AMD Instinct™ accelerators. GCN GPUs, +such as the AMD Instinct MI50 had a 256 KiB VGPR file. The AMD +Instinct MI100 (CDNA) has a 2x256 KiB register file, where one half +is available as general-purpose VGPRs, and the other half is for matrix +math accumulation VGPRs (AGPRs). The AMD Instinct :ref:`MI2XX ` +(CDNA2) has a 512 KiB VGPR file per CU, where each wave can dynamically request +up to 256 KiB of VGPRs and an additional 256 KiB of AGPRs. For more information, +refer to `this comment `_. + diff --git a/projects/rocprofiler-compute/docs/conceptual/pipeline-metrics.rst b/projects/rocprofiler-compute/docs/conceptual/pipeline-metrics.rst new file mode 100644 index 0000000000..f7bb4bcdae --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/pipeline-metrics.rst @@ -0,0 +1,909 @@ +.. 
meta:: + :description: Omniperf performance model: Pipeline metrics + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, pipeline, wavefront, metrics, launch, runtime, + VALU, MFMA, instruction mix, FLOPs, arithmetic, operations + +**************** +Pipeline metrics +**************** + +In this section, we describe the metrics available in Omniperf to analyze the +pipelines discussed in the :doc:`pipeline-descriptions`. + +.. _wavefront: + +Wavefront +========= + +.. _wavefront-launch-stats: + +Wavefront launch stats +---------------------- + +The wavefront launch stats panel gives general information about the +kernel launch: + +.. list-table:: + :header-rows: 1 + :widths: 20 65 15 + + * - Metric + + - Description + + - Unit + + * - Grid Size + + - The total number of work-items (or, threads) launched as a part of + the kernel dispatch. In HIP, this is equivalent to the total grid size + multiplied by the total workgroup (or, block) size. + + - :ref:`Work-items ` + + * - Workgroup Size + + - The total number of work-items (or, threads) in each workgroup + (or, block) launched as part of the kernel dispatch. In HIP, this is + equivalent to the total block size. + + - :ref:`Work-items ` + + * - Total Wavefronts + + - The total number of wavefronts launched as part of the kernel dispatch. + On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront size is + always 64 work-items. Thus, the total number of wavefronts should be + equivalent to the ceiling of grid size divided by 64. + + - :ref:`Wavefronts ` + + * - Saved Wavefronts + + - The total number of wavefronts saved at a context-save. See + `cwsr_enable `_. + + - :ref:`Wavefronts ` + + * - Restored Wavefronts + + - The total number of wavefronts restored from a context-save. See + `cwsr_enable `_. + + - :ref:`Wavefronts ` + + * - VGPRs + + - The number of architected vector general-purpose registers allocated for + the kernel, see :ref:`VALU `.
Note: this may not exactly + match the number of VGPRs requested by the compiler due to allocation + granularity. + + - :ref:`VGPRs ` + + * - AGPRs + + - The number of accumulation vector general-purpose registers allocated for + the kernel, see :ref:`AGPRs `. Note: this may not exactly + match the number of AGPRs requested by the compiler due to allocation + granularity. + + - :ref:`AGPRs ` + + * - SGPRs + + - The number of scalar general-purpose registers allocated for the kernel, + see :ref:`SALU `. Note: this may not exactly match the number + of SGPRs requested by the compiler due to allocation granularity. + + - :ref:`SGPRs ` + + * - LDS Allocation + + - The number of bytes of :doc:`LDS ` memory (or, shared + memory) allocated for this kernel. Note: This may also be larger than + what was requested at compile time due to both allocation granularity and + dynamic per-dispatch LDS allocations. + + - Bytes per :ref:`workgroup ` + + * - Scratch Allocation + + - The number of bytes of :ref:`scratch memory ` requested + per work-item for this kernel. Scratch memory is used for stack memory + on the accelerator, as well as for register spills and restores. + + - Bytes per :ref:`work-item ` + +.. _wavefront-runtime-stats: + +Wavefront runtime stats +----------------------- + +The wavefront runtime statistics panel gives a high-level overview of the +execution of wavefronts in a kernel: + +.. list-table:: + :header-rows: 1 + :widths: 18 65 17 + + * - Metric + + - Description + + - Unit + + * - :ref:`Kernel time ` + + - The total duration of the executed kernel. Note: this should not be + directly compared to the wavefront cycles / timings below. + + - Nanoseconds + + * - :ref:`Kernel cycles ` + + - The total duration of the executed kernel in cycles. Note: this should + not be directly compared to the wavefront cycles / timings below. + + - Cycles + + * - Instructions per wavefront + + - The average number of instructions (of all types) executed per wavefront.
+ This is averaged over all wavefronts in a kernel dispatch. + + - Instructions / wavefront + + * - Wave cycles + + - The number of cycles a wavefront in the kernel dispatch spent resident on + a compute unit per :ref:`normalization unit `. This + is averaged over all wavefronts in a kernel dispatch. Note: this should + not be directly compared to the kernel cycles above. + + - Cycles per :ref:`normalization unit ` + + * - Dependency wait cycles + + - The number of cycles a wavefront in the kernel dispatch stalled waiting + on memory of any kind (e.g., instruction fetch, vector or scalar memory, + etc.) per :ref:`normalization unit `. This counter + is incremented at every cycle by *all* wavefronts on a CU stalled at a + memory operation. As such, it is most useful to get a sense of how waves + were spending their time, rather than identification of a precise limiter + because another wave could be actively executing while a wave is stalled. + The sum of this metric, Issue Wait Cycles and Active Cycles should be + equal to the total Wave Cycles metric. + + - Cycles per :ref:`normalization unit ` + + * - Issue Wait Cycles + + - The number of cycles a wavefront in the kernel dispatch was unable to + issue an instruction for any reason (e.g., execution pipe back-pressure, + arbitration loss, etc.) per + :ref:`normalization unit `. This counter is + incremented at every cycle by *all* wavefronts on a CU unable to issue an + instruction. As such, it is most useful to get a sense of how waves were + spending their time, rather than identification of a precise limiter + because another wave could be actively executing while a wave is issue + stalled. The sum of this metric, Dependency Wait Cycles and Active + Cycles should be equal to the total Wave Cycles metric. + + - Cycles per :ref:`normalization unit ` + + * - Active Cycles + + - The average number of cycles a wavefront in the kernel dispatch was + actively executing instructions per + :ref:`normalization unit `. 
This measurement is made + on a per-wavefront basis, and may include cycles that another wavefront + spent actively executing (on another execution unit, for example) or was + stalled. As such, it is most useful to get a sense of how waves were + spending their time, rather than identification of a precise limiter. The + sum of this metric, Issue Wait Cycles and Dependency Wait Cycles should be + equal to the total Wave Cycles metric. + + - Cycles per :ref:`normalization unit ` + + * - Wavefront Occupancy + + - The time-averaged number of wavefronts resident on the accelerator over + the lifetime of the kernel. Note: this metric may be inaccurate for + short-running kernels (less than 1ms). + + - :ref:`Wavefronts ` + +.. note:: + + As mentioned earlier, the measurement of kernel cycles and time typically + cannot be directly compared to, for example, wave cycles. This is due to two factors. + First, the kernel cycles/timings are measured using a counter that is + impacted by scheduling overhead; this is particularly noticeable for + "short-running" kernels (less than 1ms) where scheduling overhead forms a + significant portion of the overall kernel runtime. Second, the wave cycles + metric is incremented per-wavefront scheduled to a SIMD every cycle whereas + the kernel cycles counter is incremented only once per-cycle when *any* + wavefront is scheduled. + +.. _instruction-mix: + +Instruction mix +=============== + +The instruction mix panel shows a breakdown of the various types of instructions +executed by the user’s kernel, and which pipelines on the +:doc:`CU ` they were executed on. In addition, Omniperf reports +further information about the breakdown of operation types for the +:ref:`VALU `, vector-memory, and :ref:`MFMA ` +instructions. + +.. note:: + + All metrics in this section count *instructions issued*, and *not* the total + number of operations executed.
The values reported by these metrics will not + change regardless of the execution mask of the wavefront. Note that even if + the execution mask is identically zero (meaning that *no lanes are active*) + the instruction will still be counted, as CDNA accelerators still consider + these instructions *issued*. See + :mi200-isa-pdf:`EXECute Mask, section 3.3 of the CDNA2 ISA guide<19>` for + examples and further details. + +Overall instruction mix +----------------------- + +This panel shows the total number of each type of instruction issued to +the :doc:`various compute pipelines ` on the +:doc:`CU `. These are: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - :ref:`VALU ` instructions + + - The total number of vector arithmetic logic unit (VALU) operations + issued. These are the workhorses of the + :doc:`compute unit `, and are used to execute a wide range of + instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, + shifts, conditional evaluation, etc. + + - Instructions + + * - VMEM instructions + + - The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to + :ref:`generic, global, private and texture ` memory. + + - Instructions + + * - :doc:`LDS ` instructions + + - The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's ``__shfl`` operations. + + - Instructions + + * - :ref:`MFMA ` instructions + + - The total number of matrix fused multiply-add instructions issued. + + - Instructions + + * - :ref:`SALU ` instructions + + - The total number of scalar arithmetic logic unit (SALU) operations + issued. Typically these are used for address calculations, literal + constants, and other operations that are *provably* uniform across a + wavefront. 
Although scalar memory (SMEM) operations are issued by the + SALU, they are counted separately in this section. + + - Instructions + + * - SMEM instructions + + - The total number of scalar memory (SMEM) operations issued. These are + typically used for loading kernel arguments, base-pointers and loads + from HIP's ``__constant__`` memory. + + - Instructions + + * - :ref:`Branch ` instructions + + - The total number of branch operations issued. These typically consist of + jump or branch operations and are used to implement control flow. + + - Instructions + +.. note:: + + Note, as mentioned in the :ref:`desc-branch` section: branch + operations are not used for execution mask updates, but only for "whole + wavefront" control flow changes. + +.. _valu-arith-instruction-mix: + +VALU arithmetic instruction mix +------------------------------- + +.. warning:: + + Not all metrics in this section (for instance, the floating-point instruction + breakdowns) are available on CDNA accelerators older than the + :ref:`MI2XX ` series. + +This panel details the various types of vector instructions that were +issued to the :ref:`VALU `. The metrics in this section do *not* +include :ref:`MFMA ` instructions using the same precision; for +instance, the “F16-ADD” metric does not include any 16-bit floating point +additions executed as part of an MFMA instruction using the same precision. + +.. list-table:: + :header-rows: 1 + :widths: 15 65 20 + + * - Metric + + - Description + + - Unit + + * - INT32 + + - The total number of instructions operating on 32-bit integer operands + issued to the VALU per :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - INT64 + + - The total number of instructions operating on 64-bit integer operands + issued to the VALU per :ref:`normalization unit `. 
+ + - Instructions per :ref:`normalization unit ` + + * - F16-ADD + + - The total number of addition instructions operating on 16-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F16-MUL + + - The total number of multiplication instructions operating on 16-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F16-FMA + + - The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F16-TRANS + + - The total number of transcendental instructions (such as ``sqrt``) operating + on 16-bit floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F32-ADD + + - The total number of addition instructions operating on 32-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F32-MUL + + - The total number of multiplication instructions operating on 32-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F32-FMA + + - The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F32-TRANS + + - The total number of transcendental instructions (such as ``sqrt``) + operating on 32-bit floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F64-ADD + + - The total number of addition instructions operating on 64-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `.
+ + - Instructions per :ref:`normalization unit ` + + * - F64-MUL + + - The total number of multiplication instructions operating on 64-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F64-FMA + + - The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - F64-TRANS + + - The total number of transcendental instructions (such as ``sqrt``) + operating on 64-bit floating-point operands issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Conversion + + - The total number of type conversion instructions (such as converting data + between F32 and F64) issued to the VALU per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + +For an example of these counters in action, refer to +:ref:`valu-arith-instruction-mix-ex`. + +.. _vmem-instruction-mix: + +VMEM instruction mix +-------------------- + +This section breaks down the types of vector memory (VMEM) instructions +that were issued. Refer to the +:ref:`Instruction Counts metrics section ` under address +processor front end of the vL1D cache for descriptions of these VMEM +instructions. + +.. _mfma-instruction-mix: + +MFMA instruction mix +-------------------- + +.. warning:: + + The metrics in this section are only available on CDNA2 + (:ref:`MI2XX `) accelerators and newer. + +This section details the types of Matrix Fused Multiply-Add +(:ref:`MFMA `) instructions that were issued. Note that +MFMA instructions are classified by the type of input data they operate on, and +*not* the data type the result is accumulated to. + +..
list-table:: + :header-rows: 1 + :widths: 25 60 17 + + * - Metric + + - Description + + - Unit + + * - MFMA-I8 Instructions + + - The total number of 8-bit integer :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - MFMA-F16 Instructions + + - The total number of 16-bit floating point :ref:`MFMA ` + instructions issued per :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - MFMA-BF16 Instructions + + - The total number of 16-bit brain floating point :ref:`MFMA ` + instructions issued per :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - MFMA-F32 Instructions + + - The total number of 32-bit floating-point :ref:`MFMA ` + instructions issued per :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - MFMA-F64 Instructions + + - The total number of 64-bit floating-point :ref:`MFMA ` + instructions issued per :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + +Compute pipeline +================ + +.. _metrics-flop-count: + +FLOP counting conventions +------------------------- + +Omniperf’s conventions for VALU FLOP counting are as follows: + +* Addition or multiplication: 1 operation + +* Transcendentals: 1 operation + +* Fused multiply-add (FMA): 2 operations + +Integer operations (IOPs) do not use this convention. They are counted +as a single operation regardless of the instruction type. + +.. note:: + + Packed operations which operate on multiple operands in the same instruction + are counted identically to the underlying instruction type. For example, the + ``v_pk_add_f32`` instruction on :ref:`MI2XX `, which performs an + add operation on two pairs of aligned 32-bit floating-point operands is + counted only as a single addition -- that is, 1 operation. 
+ +As discussed in the :ref:`instruction-mix` section, the FLOP/IOP +metrics in this section do not take into account the execution mask of +the operation, and will report the same value even if the execution mask +is identically zero. + +For example, an FMA instruction operating on 32-bit floating-point +operands (such as ``v_fma_f32`` on a :ref:`MI2XX ` accelerator) +would be counted as 128 total FLOPs: 2 operations (due to the +instruction type) multiplied by 64 operations (because the wavefront is +composed of 64 work-items). + +.. _compute-speed-of-light: + +Compute Speed-of-Light +---------------------- + +.. warning:: + + The theoretical maximum throughput for some metrics in this section is + currently computed with the maximum achievable clock frequency, as reported + by ``rocminfo``, for an accelerator. This may not be realistic for all + workloads. + +This section reports the number of floating-point and integer operations +executed on the :ref:`VALU ` and :ref:`MFMA ` units in +various precisions. We note that unlike the +:ref:`VALU instruction mix ` and +:ref:`MFMA instruction mix ` sections, the metrics here +are reported as FLOPs and IOPs, that is, the total number of operations +executed. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - VALU FLOPs + + - The total floating-point operations executed per second on the + :ref:`VALU `. This is also presented as a percent of the peak + theoretical FLOPs achievable on the specific accelerator. Note: this does + not include any floating-point operations from :ref:`MFMA ` + instructions. + + - GFLOPs + + * - VALU IOPs + + - The total integer operations executed per second on the + :ref:`VALU `. This is also presented as a percent of the peak + theoretical IOPs achievable on the specific accelerator. Note: this does + not include any integer operations from :ref:`MFMA ` + instructions.
+ + - GIOPs + + * - MFMA FLOPs (BF16) + + - The total number of 16-bit brain floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 16-bit + brain floating point operations from :ref:`VALU ` + instructions. This is also presented as a percent of the peak theoretical + BF16 MFMA operations achievable on the specific accelerator. + + - GFLOPs + + * - MFMA FLOPs (F16) + + - The total number of 16-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 16-bit + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical F16 MFMA + operations achievable on the specific accelerator. + + - GFLOPs + + * - MFMA FLOPs (F32) + + - The total number of 32-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 32-bit + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical F32 MFMA + operations achievable on the specific accelerator. + + - GFLOPs + + * - MFMA FLOPs (F64) + + - The total number of 64-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 64-bit + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical F64 MFMA + operations achievable on the specific accelerator. + + - GFLOPs + + * - MFMA IOPs (INT8) + + - The total number of 8-bit integer :ref:`MFMA ` operations + executed per second. Note: this does not include any 8-bit integer + operations from :ref:`VALU ` instructions. This is also + presented as a percent of the peak theoretical INT8 MFMA operations + achievable on the specific accelerator. + + - GIOPs + +.. _pipeline-stats: + +Pipeline statistics +------------------- + +This section reports a number of key performance characteristics of +various execution units on the :doc:`CU `. 
Refer to +:ref:`ipc-example` for a detailed dive into these metrics, and the +:ref:`scheduler ` section for a high-level overview of execution +units and instruction issue. + +.. list-table:: + :header-rows: 1 + :widths: 20 65 15 + + * - Metric + + - Description + + - Unit + + * - IPC + + - The ratio of the total number of instructions executed on the + :doc:`CU ` over the + :ref:`total active CU cycles `. + + - Instructions per-cycle + + * - IPC (Issued) + + - The ratio of the total number of + (non-:ref:`internal `) instructions issued over + the number of cycles where the :ref:`scheduler ` was + actively working on issuing instructions. Refer to the + :ref:`Issued IPC ` example for further detail. + + - Instructions per-cycle + + * - SALU utilization + + - Indicates what percent of the kernel's duration the + :ref:`SALU ` was busy executing instructions. Computed as the + ratio of the total number of cycles spent by the + :ref:`scheduler ` issuing SALU / :ref:`SMEM ` + instructions over the :ref:`total CU cycles `. + + - Percent + + * - VALU utilization + + - Indicates what percent of the kernel's duration the + :ref:`VALU ` was busy executing instructions. Does not include + :ref:`VMEM ` operations. Computed as the ratio of the total + number of cycles spent by the :ref:`scheduler ` issuing + VALU instructions over the :ref:`total CU cycles `. + + - Percent + + * - VMEM utilization + + - Indicates what percent of the kernel's duration the + :ref:`VMEM ` unit was busy executing instructions, including + both global/generic and spill/scratch operations (see the + :ref:`VMEM instruction count metrics ` for more + detail). Does not include :ref:`VALU ` operations. Computed + as the ratio of the total number of cycles spent by the + :ref:`scheduler ` issuing VMEM instructions over the + :ref:`total CU cycles `. + + - Percent + + * - Branch utilization + + - Indicates what percent of the kernel's duration the + :ref:`branch ` unit was busy executing instructions.
+ Computed as the ratio of the total number of cycles spent by the + :ref:`scheduler ` issuing branch instructions over the + :ref:`total CU cycles `. + + - Percent + + * - VALU active threads + + - Indicates the average level of :ref:`divergence ` within + a wavefront over the lifetime of the kernel. The number of work-items + that were active in a wavefront during execution of each + :ref:`VALU ` instruction, time-averaged over all VALU + instructions run on all wavefronts in the kernel. + + - Work-items + + * - MFMA utilization + + - Indicates what percent of the kernel's duration the + :ref:`MFMA ` unit was busy executing instructions. Computed as + the ratio of the total number of cycles the + :ref:`MFMA ` unit was busy over the + :ref:`total CU cycles `. + + - Percent + + * - MFMA instruction cycles + + - The average duration of :ref:`MFMA ` instructions in this + kernel in cycles. Computed as the ratio of the total number of cycles the + MFMA unit was busy over the total number of MFMA instructions. Compare + to, for example, the + `AMD Matrix Instruction Calculator `_. + + - Cycles per instruction + + * - VMEM latency + + - The average number of round-trip cycles (that is, from issue to data + return / acknowledgment) required for a VMEM instruction to complete. + + - Cycles + + * - SMEM latency + + - The average number of round-trip cycles (that is, from issue to data + return / acknowledgment) required for a SMEM instruction to complete. + + - Cycles + +.. note:: + + The branch utilization reported in this section also includes time spent in + other instruction types (namely: ``s_endpgm``) that are *typically* a very + small percentage of the overall kernel execution. This complication is + omitted for simplicity, but may result in small amounts of branch utilization + (typically less than 1%) for otherwise branch-less kernels. + +..
_arithmetic-operations: + +Arithmetic operations +--------------------- + +This section reports the total number of floating-point and integer +operations executed in various precisions. Unlike the +:ref:`compute-speed-of-light` panel, this section reports both +:ref:`VALU ` and :ref:`MFMA ` operations of the same precision +(e.g., F32) in the same metric. Additionally, this panel lets the user +control how the data is normalized (i.e., control the +:ref:`normalization unit `), while the speed-of-light panel does +not. For more detail on how operations are counted see the +:ref:`FLOP counting convention ` section. + +.. warning:: + + As discussed in :ref:`instruction-mix`, the metrics in this section do not + take into account the execution mask of the operation, and will report the + same value even if EXEC is identically zero. + +.. list-table:: + :header-rows: 1 + :widths: 18 65 17 + + * - Metric + + - Description + + - Unit + + * - FLOPs (Total) + + - The total number of floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per + :ref:`normalization unit `. + + - FLOP per :ref:`normalization unit ` + + * - IOPs (Total) + + - The total number of integer operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per + :ref:`normalization unit `. + + - IOP per :ref:`normalization unit ` + + * - F16 OPs + + - The total number of 16-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per + :ref:`normalization unit `. + + - FLOP per :ref:`normalization unit ` + + * - BF16 OPs + + - The total number of 16-bit brain floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per + :ref:`normalization unit `. Note: on current CDNA + accelerators, the VALU has no native BF16 instructions. 
+ + - FLOP per :ref:`normalization unit ` + + * - F32 OPs + + - The total number of 32-bit floating-point operations executed on either + the :ref:`VALU ` or :ref:`MFMA ` units, per + :ref:`normalization unit `. + + - FLOP per :ref:`normalization unit ` + + * - F64 OPs + + - The total number of 64-bit floating-point operations executed on either + the :ref:`VALU ` or :ref:`MFMA ` units, per + :ref:`normalization unit `. + + - FLOP per :ref:`normalization unit ` + + * - INT8 OPs + + - The total number of 8-bit integer operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per + :ref:`normalization unit `. Note: on current CDNA + accelerators, the VALU has no native INT8 instructions. + + - IOPs per :ref:`normalization unit ` + diff --git a/projects/rocprofiler-compute/docs/conceptual/references.rst b/projects/rocprofiler-compute/docs/conceptual/references.rst new file mode 100644 index 0000000000..9f3d32cd80 --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/references.rst @@ -0,0 +1,26 @@ +.. meta:: + :description: Omniperf performance model: References + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, HIP, GCN, LLVM, docs, documentation, training + +********** +References +********** + +Some sections in :doc:`this chapter ` cite the +following publicly available documentation. 
+ +* :hip-training-pdf:`Introduction to AMD GPU Programming with HIP <>` + +* :mi200-isa-pdf:`CDNA2 ISA Reference Guide <>` + +* :cdna2-white-paper:`CDNA2 white paper <>` + +* :hsa-runtime-pdf:`HSA Runtime Programmer's Reference Manual <>` + +* :gcn-crash-course:`The AMD GCN Architecture - A Crash Course (Layla Mah) <>` + +* :mantor-gcn-pdf:`AMD Radeon HD7970 with GCN Architecture <>` + +* :mantor-vega10-pdf:`AMD Radeon Next Generation GPU Architecture - Vega10 <>` + +* :llvm-docs:`LLVM User Guide for AMDGPU Backend <>` diff --git a/projects/rocprofiler-compute/docs/conceptual/shader-engine.rst b/projects/rocprofiler-compute/docs/conceptual/shader-engine.rst new file mode 100644 index 0000000000..8295c45160 --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/shader-engine.rst @@ -0,0 +1,707 @@ +.. meta:: + :description: Omniperf performance model: Shader engine (SE) + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, shader, engine, sL1D, L1I, workgroup manager, SPI + +****************** +Shader engine (SE) +****************** + +The :doc:`compute units ` on a CDNA™ accelerator are grouped +together into a higher-level organizational unit called a shader engine (SE): + +.. figure:: ../data/performance-model/selayout.png + :align: center + :alt: Example of CU-grouping into shader engines + :width: 800 + + Example of CU-grouping into shader engines on AMD Instinct MI-series + accelerators. + +The number of CUs on a SE varies from chip to chip -- see for example +:hip-training-pdf:`20`. In addition, newer accelerators such as the AMD +Instinct™ MI 250X have 8 SEs per accelerator. + +For the purposes of Omniperf, we consider resources that are shared between +multiple CUs on a single SE as part of the SE's metrics. + +These include: + +* The :ref:`scalar L1 data cache ` + +* The :ref:`L1 instruction cache ` + +* The :ref:`workgroup manager ` + +.. 
_desc-sl1d: + +Scalar L1 data cache (sL1D) +=========================== + +The Scalar L1 Data cache (sL1D) can cache data accessed from scalar load +instructions (and scalar store instructions on architectures where they exist) +from wavefronts in the :doc:`CUs `. The sL1D is shared between +multiple CUs (:gcn-crash-course:`36`) -- the exact number of CUs depends on the +architecture in question (3 CUs in GCN™ GPUs and MI100, 2 CUs in +:ref:`MI2XX `) -- and is backed by the :doc:`L2 cache `. + +In typical usage, the data in the sL1D is comprised of: + +* Kernel arguments, such as pointers, + `non-populated `_ + grid and block dimensions, and others + +* HIP's ``__constant__`` memory, when accessed in a provably uniform manner + [#uniform-access]_ + +* Other memory, when accessed in a provably uniform manner, *and* the backing + memory is provably constant [#uniform-access]_ + +.. _desc-sl1d-sol: + +Scalar L1D Speed-of-Light +------------------------- + +.. warning:: + + The theoretical maximum throughput for some metrics in this section are + currently computed with the maximum achievable clock frequency, as reported + by ``rocminfo``, for an accelerator. This may not be realistic for all + workloads. + +The Scalar L1D speed-of-light chart shows some key metrics of the sL1D +cache as a comparison with the peak achievable values of those metrics: + +.. list-table:: + :header-rows: 1 + :widths: 20 65 15 + + * - Metric + + - Description + + - Unit + + * - Bandwidth + + - The number of bytes looked up in the sL1D cache, as a percent of the peak + theoretical bandwidth. Calculated as the ratio of sL1D requests over the + :ref:`total sL1D cycles `. + + - Percent + + * - Cache Hit Rate + + - The percent of sL1D requests that hit [#sl1d-cache]_ on a previously + loaded line in the cache. Calculated as the ratio of the number of sL1D + requests that hit over the number of all sL1D requests. 
+ + - Percent + + * - sL1D-L2 BW + + - The number of bytes requested by the sL1D from the L2 cache, as a percent + of the peak theoretical sL1D → L2 cache bandwidth. Calculated as the + ratio of the total number of requests from the sL1D to the L2 cache over + the :ref:`total sL1D-L2 interface cycles `. + + - Percent + +.. _desc-sl1d-stats: + +Scalar L1D cache accesses +------------------------- + +This panel gives more detail on the types of accesses made to the sL1D, +and the hit/miss statistics. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Requests + + - The total number of requests, of any size or type, made to the sL1D per + :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Hits + + - The total number of sL1D requests that hit on a previously loaded cache + line, per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Misses - Non Duplicated + + - The total number of sL1D requests that missed on a cache line that *was + not* already pending due to another request, per + :ref:`normalization unit `. See :ref:`desc-sl1d-sol` + for more detail. + + - Requests per :ref:`normalization unit ` + + * - Misses - Duplicated + + - The total number of sL1D requests that missed on a cache line that *was* + already pending due to another request, per + :ref:`normalization unit `. See + :ref:`desc-sl1d-sol` for more detail. + + - Requests per :ref:`normalization unit ` + + * - Cache Hit Rate + + - Indicates the percent of sL1D requests that hit on a previously loaded + line the cache. The ratio of the number of sL1D requests that hit + [#sl1d-cache]_ over the number of all sL1D requests. + + - Percent + + * - Read Requests (Total) + + - The total number of sL1D read requests of any size, per + :ref:`normalization unit `. 
+ + - Requests per :ref:`normalization unit ` + + * - Atomic Requests + + - The total number of sL1D atomic requests of any size, per + :ref:`normalization unit `. Typically unused on CDNA + accelerators. + + - Requests per :ref:`normalization unit ` + + * - Read Requests (1 DWord) + + - The total number of sL1D read requests made for a single dword of data + (4B), per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Read Requests (2 DWord) + + - The total number of sL1D read requests made for a two dwords of data + (8B), per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Read Requests (4 DWord) + + - The total number of sL1D read requests made for a four dwords of data + (16B), per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Read Requests (8 DWord) + + - The total number of sL1D read requests made for a eight dwords of data + (32B), per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Read Requests (16 DWord) + + - The total number of sL1D read requests made for a sixteen dwords of data + (64B), per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + +.. _desc-sl1d-l2-interface: + +sL1D ↔ L2 Interface +------------------- + +This panel gives more detail on the data requested across the +sL1D↔ +:doc:`L2 ` interface. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - sL1D-L2 BW + + - The total number of bytes read from, written to, or atomically updated + across the sL1D↔:doc:`L2 ` interface, per + :ref:`normalization unit `. Note that sL1D writes + and atomics are typically unused on current CDNA accelerators, so in the + majority of cases this can be interpreted as an sL1D→L2 read bandwidth. + + - Bytes per :ref:`normalization unit ` + + * - Read Requests + + - The total number of read requests from sL1D to the :doc:`L2 `, + per :ref:`normalization unit `. 
+ + - Requests per :ref:`normalization unit ` + + * - Write Requests + + - The total number of write requests from sL1D to the :doc:`L2 `, + per :ref:`normalization unit `. Typically unused on + current CDNA accelerators. + + - Requests per :ref:`normalization unit ` + + * - Atomic Requests + + - The total number of atomic requests from sL1D to the + :doc:`L2 `, per + :ref:`normalization unit `. Typically unused on + current CDNA accelerators. + + - Requests per :ref:`normalization unit ` + + * - Stall Cycles + + - The total number of cycles the sL1D↔ + :doc:`L2 ` interface was stalled, per + :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + +.. rubric:: Footnotes + +.. [#uniform-access] The scalar data cache is used when the compiler emits + scalar loads to access data. This requires that the data be *provably* + uniformly accesses (that is, the compiler can verify that all work-items in a + wavefront access the same data), *and* that the data can be proven to be + read-only (for instance, HIP's ``__constant__`` memory, or properly + ``__restrict__``\ed pointers to avoid write-aliasing). Access of + ``__constant__`` memory for example is not guaranteed to go through the sL1D + if the wavefront loads a non-uniform value. + +.. [#sl1d-cache] Unlike the :doc:`vL1D ` and + :doc:`L2 ` caches, the sL1D cache on AMD Instinct MI-series CDNA + accelerators does *not* use the "hit-on-miss" approach to reporting cache + hits. That is, if while satisfying a miss, another request comes in that + would hit on the same pending cache line, the subsequent request will be + counted as a *duplicated miss*. + +.. _desc-l1i: + +L1 Instruction Cache (L1I) +========================== + +As with the :ref:`sL1D `, the L1 Instruction (L1I) cache is shared +between multiple CUs on a shader-engine, where the precise number of CUs +sharing a L1I depends on the architecture in question (:gcn-crash-course:`36`) +and is backed by the :doc:`L2 cache `. 
Unlike the sL1D, the +instruction cache is read-only. + +.. _desc-l1i-sol: + +L1I Speed-of-Light +------------------ + +.. warning:: + + The theoretical maximum throughput for some metrics in this section are + currently computed with the maximum achievable clock frequency, as reported + by ``rocminfo``, for an accelerator. This may not be realistic for all + workloads. + +The L1 Instruction Cache speed-of-light chart shows some key metrics of +the L1I cache as a comparison with the peak achievable values of those +metrics: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Bandwidth + + - The number of bytes looked up in the L1I cache, as a percent of the peak + theoretical bandwidth. Calculated as the ratio of L1I requests over the + :ref:`total L1I cycles `. + + - Percent + + * - Cache Hit Rate + + - The percent of L1I requests that hit on a previously loaded line the + cache. Calculated as the ratio of the number of L1I requests that hit + [#l1i-cache]_ over the number of all L1I requests. + + - Percent + + * - L1I-L2 BW + + - The percent of the peak theoretical L1I → L2 cache request bandwidth + achieved. Calculated as the ratio of the total number of requests from + the L1I to the L2 cache over the + :ref:`total L1I-L2 interface cycles `. + + - Percent + + * - Instruction Fetch Latency + + - The average number of cycles spent to fetch instructions to a + :doc:`CU `. + + - Cycles + +.. _desc-l1i-stats: + +L1I cache accesses +------------------ + +This panel gives more detail on the hit/miss statistics of the L1I: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Requests + + - The total number of requests made to the L1I per + :ref:`normalization-unit `. + + - Requests per :ref:`normalization unit `. + + * - Hits + + - The total number of L1I requests that hit on a previously loaded cache + line, per :ref:`normalization-unit `. 
+ + - Requests per :ref:`normalization unit ` + + * - Misses - Non Duplicated + + - The total number of L1I requests that missed on a cache line that + *were not* already pending due to another request, per + :ref:`normalization-unit `. See note in + :ref:`desc-l1i-sol` for more detail. + + - Requests per :ref:`normalization unit `. + + * - Misses - Duplicated + + - The total number of L1I requests that missed on a cache line that *were* + already pending due to another request, per + :ref:`normalization-unit `. See note in + :ref:`desc-l1i-sol` for more detail. + + - Requests per :ref:`normalization unit ` + + * - Cache Hit Rate + + - The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded + line the cache. Calculated as the ratio of the number of L1I requests + that hit over the number of all L1I requests. + + - Percent + +L1I - L2 interface +------------------ + +This panel gives more detail on the data requested across the +L1I-:doc:`L2 ` interface. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - L1I-L2 BW + + - The total number of bytes read across the L1I-:doc:`L2 ` + interface, per :ref:`normalization unit `. + + - Bytes per :ref:`normalization unit ` + +.. rubric:: Footnotes + +.. [#l1i-cache] Unlike the :doc:`vL1D ` and + :doc:`L2 ` caches, the L1I cache on AMD Instinct MI-series CDNA + accelerators does *not* use the "hit-on-miss" approach to reporting cache + hits. That is, if while satisfying a miss, another request comes in that + would hit on the same pending cache line, the subsequent request will be + counted as a *duplicated miss*. + +.. _desc-spi: + +Workgroup manager (SPI) +======================= + +The workgroup manager (SPI) is the bridge between the +:doc:`command processor ` and the +:doc:`compute units `. 
After the command processor processes a +kernel dispatch, it will then pass the dispatch off to the workgroup manager, +which then schedules :ref:`workgroups ` onto the compute units. +As workgroups complete execution and resources become available, the +workgroup manager will schedule new workgroups onto compute units. The workgroup +manager’s metrics therefore are focused on reporting the following: + +* Utilizations of various parts of the accelerator that the workgroup + manager interacts with (and the workgroup manager itself) + +* How many workgroups were dispatched, their size, and how many + resources they used + +* Percent of scheduler opportunities (cycles) where workgroups failed + to dispatch, and + +* Percent of scheduler opportunities (cycles) where workgroups failed + to dispatch due to lack of a specific resource on the CUs (for instance, too + many VGPRs allocated) + +This gives you an idea of why the workgroup manager couldn’t schedule more +wavefronts onto the device, and is most useful for workloads that you suspect to +be limited by scheduling or launch rate. + +As discussed in :doc:`Command processor `, the command +processor on AMD Instinct MI-series architectures contains four hardware +scheduler-pipes, each with eight software threads (:mantor-vega10-pdf:`19`). Each +scheduler-pipe can issue a kernel dispatch to the workgroup manager to schedule +concurrently. Therefore, some workgroup manager metrics are presented relative +to the utilization of these scheduler-pipes (for instance, whether all four are +issuing concurrently). + +.. note:: + + Current versions of the profiling libraries underlying Omniperf attempt to + serialize concurrent kernels running on the accelerator, as the performance + counters on the device are global (that is, shared between concurrent + kernels). This means that these scheduler-pipe utilization metrics are + expected to reach (for example) a maximum of one pipe active -- only 25%. 
+ +Workgroup manager utilizations +------------------------------ + +This section describes the utilization of the workgroup manager, and the +hardware components it interacts with. + +.. list-table:: + :header-rows: 1 + :widths: 20 65 15 + + * - Metric + + - Description + + - Unit + + * - Accelerator utilization + + - The percent of cycles in the kernel where the accelerator was actively + doing any work. + + - Percent + + * - Scheduler-pipe utilization + + - The percent of :ref:`total scheduler-pipe cycles ` in + the kernel where the scheduler-pipes were actively doing any work. Note: + this value is expected to range between 0% and 25%. See :ref:`desc-spi`. + + - Percent + + * - Workgroup manager utilization + + - The percent of cycles in the kernel where the workgroup manager was + actively doing any work. + + - Percent + + * - Shader engine utilization + + - The percent of :ref:`total shader engine cycles ` in the + kernel where any CU in a shader-engine was actively doing any work, + normalized over all shader-engines. Low values (e.g., << 100%) indicate + that the accelerator was not fully saturated by the kernel, or a + potential load-imbalance issue. + + - Percent + + * - SIMD utilization + + - The percent of :ref:`total SIMD cycles ` in the kernel + where any :ref:`SIMD ` on a CU was actively doing any work, + summed over all CUs. Low values (less than 100%) indicate that the + accelerator was not fully saturated by the kernel, or a potential + load-imbalance issue. + + - Percent + + * - Dispatched workgroups + + - The total number of workgroups forming this kernel launch. + + - Workgroups + + * - Dispatched wavefronts + + - The total number of wavefronts, summed over all workgroups, forming this + kernel launch. + + - Wavefronts + + * - VGPR writes + + - The average number of cycles spent initializing :ref:`VGPRs ` + at wave creation. 
+ + - Cycles/wave + + * - SGPR Writes + + - The average number of cycles spent initializing :ref:`SGPRs ` + at wave creation. + + - Cycles/wave + +Resource allocation +------------------- + +This panel gives more detail on how workgroups and wavefronts were scheduled +onto compute units, and what occupancy limiters they hit -- if any. When +analyzing these metrics, you should also take into account their +achieved occupancy -- such as +:ref:`wavefront occupancy `. A kernel may be occupancy +limited by LDS usage, for example, but may still achieve high occupancy levels +such that improving occupancy further may not improve performance. See +:ref:`occupancy-example` for details. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Not-scheduled rate (Workgroup Manager) + + - The percent of :ref:`total scheduler-pipe cycles ` in + the kernel where a workgroup could not be scheduled to a + :doc:`CU ` due to a bottleneck within the workgroup manager + rather than a lack of a CU or :ref:`SIMD ` with sufficient + resources. Note: this value is expected to range between 0-25%. See note + in :ref:`workgroup manager ` description. + + - Percent + + * - Not-scheduled rate (Scheduler-Pipe) + + - The percent of :ref:`total scheduler-pipe cycles ` in + the kernel where a workgroup could not be scheduled to a + :doc:`CU ` due to a bottleneck within the scheduler-pipes + rather than a lack of a CU or :ref:`SIMD ` with sufficient + resources. Note: this value is expected to range between 0-25%, see note + in :ref:`workgroup manager ` description. + + - Percent + + * - Scheduler-Pipe Stall Rate + + - The percent of :ref:`total scheduler-pipe cycles ` in + the kernel where a workgroup could not be scheduled to a + :doc:`CU ` due to occupancy limitations (like a lack of a + CU or :ref:`SIMD ` with sufficient resources). Note: this + value is expected to range between 0-25%, see note in + :ref:`workgroup manager ` description. 
+ + - Percent + + * - Scratch Stall Rate + + - The percent of :ref:`total shader-engine cycles ` in the + kernel where a workgroup could not be scheduled to a + :doc:`CU ` due to lack of + :ref:`private (a.k.a., scratch) memory ` slots. While this + can reach up to 100%, note that the actual occupancy limitations on a + kernel using private memory are typically quite small (for example, less + than 1% of the total number of waves that can be scheduled to an + accelerator). + + - Percent + + * - Insufficient SIMD Waveslots + + - The percent of :ref:`total SIMD cycles ` in the kernel + where a workgroup could not be scheduled to a :ref:`SIMD ` + due to lack of available :ref:`waveslots `. + + - Percent + + * - Insufficient SIMD VGPRs + + - The percent of :ref:`total SIMD cycles ` in the kernel + where a workgroup could not be scheduled to a :ref:`SIMD ` + due to lack of available :ref:`VGPRs `. + + - Percent + + * - Insufficient SIMD SGPRs + + - The percent of :ref:`total SIMD cycles ` in the kernel + where a workgroup could not be scheduled to a :ref:`SIMD ` + due to lack of available :ref:`SGPRs `. + + - Percent + + * - Insufficient CU LDS + + - The percent of :ref:`total CU cycles ` in the kernel + where a workgroup could not be scheduled to a :doc:`CU ` + due to lack of available :doc:`LDS `. + + - Percent + + * - Insufficient CU Barriers + + - The percent of :ref:`total CU cycles ` in the kernel + where a workgroup could not be scheduled to a :doc:`CU ` + due to lack of available :ref:`barriers `. + + - Percent + + * - Reached CU Workgroup Limit + + - The percent of :ref:`total CU cycles ` in the kernel + where a workgroup could not be scheduled to a :doc:`CU ` + due to limits within the workgroup manager. This is expected to be + always be zero on CDNA2 or newer accelerators (and small for previous + accelerators). 
+ + - Percent + + * - Reached CU Wavefront Limit + + - The percent of :ref:`total CU cycles ` in the kernel + where a wavefront could not be scheduled to a :doc:`CU ` + due to limits within the workgroup manager. This is expected to be + always be zero on CDNA2 or newer accelerators (and small for previous + accelerators). + + - Percent + diff --git a/projects/rocprofiler-compute/docs/conceptual/system-speed-of-light.rst b/projects/rocprofiler-compute/docs/conceptual/system-speed-of-light.rst new file mode 100644 index 0000000000..f01be4b67b --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/system-speed-of-light.rst @@ -0,0 +1,318 @@ +.. meta:: + :description: Omniperf performance model: System Speed-of-Light + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, system, speed of light + +********************* +System Speed-of-Light +********************* + +System Speed-of-Light summarizes some of the key metrics from various sections +of Omniperf’s profiling report. + +.. warning:: + + The theoretical maximum throughput for some metrics in this section are + currently computed with the maximum achievable clock frequency, as reported + by ``rocminfo``, for an accelerator. This may not be realistic for + all workloads. + + Also, not all metrics -- such as FLOP counters -- are available on all AMD + Instinct™ MI-series accelerators. For more detail on how operations are + counted, see the :ref:`metrics-flop-count` section. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - :ref:`VALU ` FLOPs + + - The total floating-point operations executed per second on the + :ref:`VALU `. This is also presented as a percent of the peak + theoretical FLOPs achievable on the specific accelerator. Note: this does + not include any floating-point operations from :ref:`MFMA ` + instructions. + + - GFLOPs + + * - :ref:`VALU ` IOPs + + - The total integer operations executed per second on the + :ref:`VALU `. 
This is also presented as a percent of the peak + theoretical IOPs achievable on the specific accelerator. Note: this does + not include any integer operations from :ref:`MFMA ` + instructions. + + - GIOPs + + * - :ref:`MFMA ` FLOPs (BF16) + + - The total number of 16-bit brain floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 16-bit + brain floating point operations from :ref:`VALU ` + instructions. This is also presented as a percent of the peak theoretical + BF16 MFMA operations achievable on the specific accelerator. + + - GFLOPs + + * - :ref:`MFMA ` FLOPs (F16) + + - The total number of 16-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 16-bit + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical F16 MFMA + operations achievable on the specific accelerator. + + - GFLOPs + + * - :ref:`MFMA ` FLOPs (F32) + + - The total number of 32-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 32-bit + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical F32 MFMA + operations achievable on the specific accelerator. + + - GFLOPs + + * - :ref:`MFMA ` FLOPs (F64) + + - The total number of 64-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 64-bit + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical F64 MFMA + operations achievable on the specific accelerator. + + - GFLOPs + + * - :ref:`MFMA ` IOPs (INT8) + + - The total number of 8-bit integer :ref:`MFMA ` operations + executed per second. Note: this does not include any 8-bit integer + operations from :ref:`VALU ` instructions. 
This is also + presented as a percent of the peak theoretical INT8 MFMA operations + achievable on the specific accelerator. + + - GIOPs + + * - :ref:`SALU ` utilization + + - Indicates what percent of the kernel's duration the + :ref:`SALU ` was busy executing instructions. Computed as the + ratio of the total number of cycles spent by the + :ref:`scheduler ` issuing :ref:`SALU ` or + :ref:`SMEM ` instructions over the + :ref:`total CU cycles `. + + - Percent + + * - :ref:`VALU ` utilization + + - Indicates what percent of the kernel's duration the + :ref:`VALU ` was busy executing instructions. Does not include + :ref:`VMEM ` operations. Computed as the ratio of the total + number of cycles spent by the :ref:`scheduler ` issuing + :ref:`VALU ` instructions over the + :ref:`total CU cycles `. + + - Percent + + * - :ref:`MFMA ` utilization + + - Indicates what percent of the kernel's duration the + :ref:`MFMA ` unit was busy executing instructions. Computed as + the ratio of the total number of cycles the MFMA was busy over the + :ref:`total CU cycles `. + + - Percent + + * - :ref:`VMEM ` utilization + + - Indicates what percent of the kernel's duration the + :ref:`VMEM ` unit was busy executing instructions, including + both global/generic and spill/scratch operations (see the + :ref:`VMEM instruction count metrics `) for more + detail). Does not include :ref:`VALU ` operations. Computed as + the ratio of the total number of cycles spent by the + :ref:`scheduler ` issuing VMEM instructions over the + :ref:`total CU cycles `. + + - Percent + + * - :ref:`Branch ` utilization + + - Indicates what percent of the kernel's duration the + :ref:`branch ` unit was busy executing instructions. 
+ Computed as the ratio of the total number of cycles spent by the + :ref:`scheduler ` issuing :ref:`branch ` + instructions over the :ref:`total CU cycles ` + + - Percent + + * - :ref:`VALU ` active threads + + - Indicates the average level of :ref:`divergence ` within + a wavefront over the lifetime of the kernel. The number of work-items + that were active in a wavefront during execution of each + :ref:`VALU ` instruction, time-averaged over all VALU + instructions run on all wavefronts in the kernel. + + - Work-items + + * - IPC + + - The ratio of the total number of instructions executed on the + :doc:`CU ` over the + :ref:`total active CU cycles `. This is also + presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + + - Instructions per-cycle + + * - Wavefront occupancy + + - The time-averaged number of wavefronts resident on the accelerator over + the lifetime of the kernel. Note: this metric may be inaccurate for + short-running kernels (less than 1ms). This is also presented as a + percent of the peak theoretical occupancy achievable on the specific + accelerator. + + - Wavefronts + + * - :doc:`LDS ` theoretical bandwidth + + - Indicates the maximum amount of bytes that could have been loaded from, + stored to, or atomically updated in the LDS per unit time (see + :ref:`LDS Bandwidth ` example for more detail). This is + also presented as a percent of the peak theoretical F64 MFMA operations + achievable on the specific accelerator. + + - GB/s + + * - :doc:`LDS ` bank conflicts/access + + - The ratio of the number of cycles spent in the + :doc:`LDS scheduler ` due to bank conflicts (as + determined by the conflict resolution hardware) to the base number of + cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the + Bank Conflict Rate). 
+ + - Conflicts/Access + + * - :doc:`vL1D ` cache hit rate + + - The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the + :ref:`vL1D cache RAM `. + + - Percent + + * - :doc:`vL1D ` cache bandwidth + + - The number of bytes looked up in the vL1D cache as a result of + :ref:`VMEM ` instructions per unit time. The number of bytes + is calculated as the number of cache lines requested multiplied by the + cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. This is also presented as a + percent of the peak theoretical bandwidth achievable on the specific + accelerator. + + - GB/s + + * - :doc:`L2 ` cache hit rate + + - The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 + cache. + + - Percent + + * - :doc:`L2 ` cache bandwidth + + - The number of bytes looked up in the L2 cache per unit time. The number + of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + e.g., if only a single value is requested in a cache line, the data + movement will still be counted as a full cache line. This is also + presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + + - GB/s + + * - :doc:`L2 `-fabric read BW + + - The number of bytes read by the L2 over the + :ref:`Infinity Fabric™ interface ` per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + + - GB/s + + * - :doc:`L2 `-fabric write and atomic BW + + - The number of bytes sent by the L2 over the + :ref:`Infinity Fabric interface ` by write and atomic + operations per unit time. 
This is also presented as a percent of the peak + theoretical bandwidth achievable on the specific accelerator. + + - GB/s + + * - :doc:`L2 `-fabric read latency + + - The time-averaged number of cycles read requests spent in Infinity Fabric + before data was returned to the L2. + + - Cycles + + * - :doc:`L2 `-fabric write latency + + - The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + + - Cycles + + * - :ref:`sL1D ` cache hit rate + + - The percent of sL1D requests that hit on a previously loaded line the + cache. Calculated as the ratio of the number of sL1D requests that hit + over the number of all sL1D requests. + + - Percent + + * - :ref:`sL1D ` bandwidth + + - The number of bytes looked up in the sL1D cache per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + + - GB/s + + * - :ref:`L1I ` bandwidth + + - The number of bytes looked up in the L1I cache per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + + - GB/s + + * - :ref:`L1I ` cache hit rate + + - The percent of L1I requests that hit on a previously loaded line the + cache. Calculated as the ratio of the number of L1I requests that hit + over the number of all L1I requests. + + - Percent + + * - :ref:`L1I ` fetch latency + + - The average number of cycles spent to fetch instructions to a + :doc:`CU `. + + - Cycles + diff --git a/projects/rocprofiler-compute/docs/conceptual/vector-l1-cache.rst b/projects/rocprofiler-compute/docs/conceptual/vector-l1-cache.rst new file mode 100644 index 0000000000..086c195be5 --- /dev/null +++ b/projects/rocprofiler-compute/docs/conceptual/vector-l1-cache.rst @@ -0,0 +1,767 @@ +.. 
meta:: + :description: Omniperf performance model: Vector L1 cache (vL1D) + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, vector, l1, cache, vl1d + +********************** +Vector L1 cache (vL1D) +********************** + +The vector L1 data (vL1D) cache is local to each +:doc:`compute unit ` on the accelerator, and handles vector memory +operations issued by a wavefront. The vL1D cache consists of several components: + +* An address processing unit, also known as the + :ref:`texture addresser ` which receives commands (instructions) and + write/atomic data from the :doc:`compute unit `, and coalesces + them into fewer requests for the cache to process. + +* An address translation unit, also known as the + :ref:`L1 Unified Translation Cache (UTCL1) `, that translates + requests from virtual to physical addresses for lookup in the cache. The + translation unit has an L1 translation lookaside buffer (L1TLB) to reduce the + cost of repeated translations. + +* A Tag RAM that looks up whether a requested cache line is already + present in the :ref:`cache `. + +* The result of the Tag RAM lookup is placed in the L1 cache controller + for routing to the correct location; for instance, the + :ref:`L2 Memory Interface ` for misses or the + :ref:`cache RAM ` for hits. + +* The cache RAM, also known as the :ref:`texture cache (TC) `, stores + requested data for potential reuse. Data returned from the + :doc:`L2 cache ` is placed into the cache RAM before going down the + :ref:`data-return path `. + +* A backend data processing unit, also known as the + :ref:`texture data (TD) ` that routes data back to the requesting + :doc:`compute unit `. + +Together, this complex is known as the vL1D, or Texture Cache per Pipe +(TCP). A simplified diagram of the vL1D is presented below: + +.. 
figure:: ../data/performance-model/l1perf_model.png + :align: center + :alt: Performance model of the vL1D Cache on AMD Instinct + :width: 800 + + Performance model of the vL1D Cache on AMD Instinct MI-series accelerators. + +.. _vl1d-sol: + +vL1D Speed-of-Light +=================== + +.. warning:: + + The theoretical maximum throughput for some metrics in this section is + currently computed with the maximum achievable clock frequency, as reported + by ``rocminfo``, for an accelerator. This may not be realistic for all + workloads. + +The vL1D’s speed-of-light chart shows several key metrics for the vL1D +as a comparison with the peak achievable values of those metrics. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Hit Rate + + - The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ + in vL1D cache over the total number of cache line requests to the + :ref:`vL1D Cache RAM `. + + - Percent + + * - Bandwidth + + - The number of bytes looked up in the vL1D cache as a result of + :ref:`VMEM ` instructions, as a percent of the peak + theoretical bandwidth achievable on the specific accelerator. The number + of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + for instance, if only a single value is requested in a cache line, the + data movement will still be counted as a full cache line. + + - Percent + + * - Utilization + + - Indicates how busy the :ref:`vL1D Cache RAM ` was during the + kernel execution. The number of cycles where the vL1D Cache RAM is + actively processing any request divided by the number of cycles where the + vL1D is active [#vl1d-activity]_. + + - Percent + + * - Coalescing + + - Indicates how well memory instructions were coalesced by the + :ref:`address processing unit `, ranging from uncoalesced (25%) + to fully coalesced (100%). 
Calculated as the average number of + :ref:`thread-requests ` generated per instruction + divided by the ideal number of thread-requests per instruction. + + - Percent + +.. _desc-ta: + +Address processing unit or Texture Addresser (TA) +================================================= + +The :doc:`vL1D `’s address processing unit receives vector +memory instructions (commands) along with write/atomic data from a +:doc:`compute unit ` and is responsible for coalescing these into +requests for lookup in the :ref:`vL1D RAM `. The address processor +passes information about the commands (coalescing state, destination SIMD, +etc.) to the :ref:`data processing unit ` for use after the requested +data has been retrieved. + +Omniperf reports several metrics to indicate performance bottlenecks in +the address processing unit, which are broken down into a few +categories: + +- :ref:`ta-busy-stall` + +- :ref:`ta-instruction-counts` + +- :ref:`ta-spill-stack` + +.. _ta-busy-stall: + +Busy / stall metrics +-------------------- + +When executing vector memory instructions, the compute unit must send an +address (and in the case of writes/atomics, data) to the address +processing unit. When the front-end cannot accept any more addresses, it +must backpressure the wave-issue logic for the VMEM pipe and prevent the +issue of further vector memory instructions. + +.. 
list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Busy + + - Percent of the :ref:`total CU cycles ` the address + processor was busy + + - Percent + + * - Address Stall + + - Percent of the :ref:`total CU cycles ` the address + processor was stalled from sending address requests further into the vL1D + pipeline + + - Percent + + * - Data Stall + + - Percent of the :ref:`total CU cycles ` the address + processor was stalled from sending write/atomic data further into the + vL1D pipeline + + - Percent + + * - Data-Processor → Address Stall + + - Percent of :ref:`total CU cycles ` the address processor + was stalled waiting to send command data to the + :ref:`data processor ` + + - Percent + +.. _ta-instruction-counts: + +Instruction counts +------------------ + +The address processor also counts instruction types to give the user +information on what sorts of memory instructions were executed by the +kernel. These are broken down into a few major categories: + +.. list-table:: + :header-rows: 1 + + * - Memory type + + - Usage + + - Description + + * - Global + + - Global memory + + - Global memory can be seen by all threads from a process. This includes + the local accelerator's DRAM, remote accelerator's DRAM, and the host's + DRAM. + + * - Generic + + - Dynamic address spaces + + - Generic memory, or "flat" memory, is used when the compiler cannot + statically prove that a pointer is to memory in one or the other address + spaces. The pointer could dynamically point into global, local, constant, + or private memory. + + * - Private Memory + + - Register spills / Stack memory + + - Private memory, or "scratch" memory, is only visible to a particular + :ref:`work-item ` in a particular + :ref:`workgroup `. On AMD Instinct™ MI-series + accelerators, private memory is used to implement both register spills + and stack memory accesses. + +The address processor counts these instruction types as follows: + +.. 
list-table:: + :header-rows: 1 + + * - Type + + - Description + + - Unit + + * - Global/Generic + + - The total number of global & generic memory instructions executed on all + :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Global/Generic Read + + - The total number of global & generic memory read instructions executed on + all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Global/Generic Write + + - The total number of global & generic memory write instructions executed + on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Global/Generic Atomic + + - The total number of global & generic memory atomic (with and without + return) instructions executed on all :doc:`compute units ` + on the accelerator, per :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Spill/Stack + + - The total number of spill/stack memory instructions executed on all + :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Spill/Stack Read + + - The total number of spill/stack memory read instructions executed on all + :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Spill/Stack Write + + - The total number of spill/stack memory write instructions executed on all + :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. + + - Instructions per :ref:`normalization unit ` + + * - Spill/Stack Atomic + + - The total number of spill/stack memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the + accelerator, per :ref:`normalization unit `. 
+ Typically unused as these memory operations are typically used to + implement thread-local storage. + + - Instructions per :ref:`normalization unit ` + +.. note:: + + The above is a simplified model specifically for the HIP programming language + that does not consider inline assembly usage, constant memory usage or + texture memory. + + These categories correspond to: + + * Global/Generic: global and flat memory operations, that are used for global + and generic memory access. + + * Spill/Stack: buffer instructions which are used on the MI50, MI100, and + :ref:`MI2XX ` accelerators for register spills / stack memory. + + These concepts are described in more detail in the :ref:`memory-spaces`, + while generic memory access is explored in the + :ref:`generic memory benchmark ` section. + +.. _ta-spill-stack: + +Spill / stack metrics +--------------------- + +Finally, the address processing unit contains a separate coalescing +stage for spill/stack memory, and thus reports: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Spill/Stack Total Cycles + + - The number of cycles the address processing unit spent working on + spill/stack instructions, per + :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + + * - Spill/Stack Coalesced Read Cycles + + - The number of cycles the address processing unit spent working on + coalesced spill/stack read instructions, per + :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + + * - Spill/Stack Coalesced Write Cycles + + - The number of cycles the address processing unit spent working on + coalesced spill/stack write instructions, per + :ref:`normalization unit `. + + - Cycles per :ref:`normalization unit ` + +.. 
_desc-utcl1: + +L1 Unified Translation Cache (UTCL1) +==================================== + +After a vector memory instruction has been processed/coalesced by the +address processing unit of the vL1D, it must be translated from a +virtual to physical address. This process is handled by the L1 Unified +Translation Cache (UTCL1). This cache contains a L1 Translation +Lookaside Buffer (TLB) which stores recently translated addresses to +reduce the cost of subsequent re-translations. + +Omniperf reports the following L1 TLB metrics: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Requests + + - The number of translation requests made to the UTCL1 per + :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Hits + + - The number of translation requests that hit in the UTCL1, and could be + reused, per :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Hit Ratio + + - The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + + - Percent + + * - Translation Misses + + - The total number of translation requests that missed in the UTCL1 due to + translation not being present in the cache, per + :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - Permission Misses + + - The total number of translation requests that missed in the UTCL1 due to + a permission error, per :ref:`normalization unit `. + This is unused and expected to be zero in most configurations for modern + CDNA™ accelerators. + + - Requests per :ref:`normalization unit ` + +.. note:: + + On current CDNA accelerators, such as the :ref:`MI2XX `, the + UTCL1 does *not* count hit-on-miss requests. + +.. 
_desc-tc: + +Vector L1 Cache RAM or Texture Cache (TC) +========================================= + +After coalescing in the :ref:`address processing unit ` of the vL1D, +and address translation in the :ref:`L1 TLB `, the request proceeds +to the Cache RAM stage of the pipeline. Incoming requests are looked up +in the cache RAMs using parts of the physical address as a tag. Hits +will be returned through the :ref:`data-return path `, while misses +will be routed out to the :doc:`L2 Cache ` for servicing. + +The metrics tracked by the vL1D RAM include: + +- :ref:`Stall metrics ` + +- :ref:`Cache access metrics ` + +- :ref:`vL1D-L2 transaction detail metrics ` + +.. _vl1d-cache-stall-metrics: + +vL1D cache stall metrics +------------------------ + +The vL1D also reports where it is stalled in the pipeline, which may +indicate performance limiters of the cache. A stall in the pipeline may +result in backpressuring earlier parts of the pipeline, e.g., a stall on +L2 requests may backpressure the wave-issue logic of the :ref:`VMEM ` +pipe and prevent it from issuing more vector memory instructions until +the vL1D’s outstanding requests are completed. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Stalled on L2 Data + + - The ratio of the number of cycles where the vL1D is stalled waiting for + requested data to return from the :doc:`L2 cache ` divided by + the number of cycles where the vL1D is active [#vl1d-activity]_. + + - Percent + + * - Stalled on L2 Requests + + - The ratio of the number of cycles where the vL1D is stalled waiting to + issue a request for data to the :doc:`L2 cache ` divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. 
+ + - Percent + + * - Tag RAM Stall (Read/Write/Atomic) + + - The ratio of the number of cycles where the vL1D is stalled due to + Read/Write/Atomic requests with conflicting tags being looked up + concurrently, divided by the number of cycles where the + vL1D is active [#vl1d-activity]_. + + - Percent + +.. _vl1d-cache-access-metrics: + +vL1D cache access metrics +------------------------- + +The vL1D cache access metrics broadly indicate the type of requests +incoming from the :ref:`cache front-end `, the number of requests that +were serviced by the vL1D, and the number & type of outgoing requests to +the :doc:`L2 cache `. In addition, this section includes the +approximate latencies of accesses to the cache itself, along with +latencies of read/write memory operations to the :doc:`L2 cache `. + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Total Requests + + - The total number of incoming requests from the + :ref:`address processing unit ` after coalescing. + + - Requests + + * - Total read/write/atomic requests + + - The total number of incoming read/write/atomic requests from the + :ref:`address processing unit ` after coalescing per + :ref:`normalization unit ` + + - Requests per :ref:`normalization unit ` + + * - Cache Bandwidth + + - The number of bytes looked up in the vL1D cache as a result of + :ref:`VMEM ` instructions per + :ref:`normalization unit `. The number of bytes is + calculated as the number of cache lines requested multiplied by the cache + line size. This value does not consider partial requests, so for + instance, if only a single value is requested in a cache line, the data + movement will still be counted as a full cache line. + + - Bytes per :ref:`normalization unit ` + + * - Cache Hit Rate [#vl1d-hit]_ + + - The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the + :ref:`vL1D Cache RAM `. 
+ + - Percent + + * - Cache Accesses + + - The total number of cache line lookups in the vL1D. + + - Cache lines + + * - Cache Hits [#vl1d-hit]_ + + - The number of cache accesses minus the number of outgoing requests to the + :doc:`L2 cache `, that is, the number of cache line requests + serviced by the :ref:`vL1D Cache RAM ` per + :ref:`normalization unit `. + + - Cache lines per :ref:`normalization unit ` + + * - Invalidations + + - The number of times the vL1D was issued a write-back invalidate command + during the kernel's execution per + :ref:`normalization unit `. This may be triggered + by, for instance, the ``buffer_wbinvl1`` instruction. + + - Invalidations per :ref:`normalization unit ` + + * - L1-L2 Bandwidth + + - The number of bytes transferred across the vL1D-L2 interface as a result + of :ref:`VMEM ` instructions, per + :ref:`normalization unit `. The number of bytes is + calculated as the number of cache lines requested multiplied by the cache + line size. This value does not consider partial requests, so for + instance, if only a single value is requested in a cache line, the data + movement will still be counted as a full cache line. + + - Bytes per :ref:`normalization unit ` + + * - L1-L2 Reads + + - The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the + :doc:`L2 Cache ` per + :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - L1-L2 Writes + + - The number of write requests to a vL1D cache line that were sent through + the vL1D to the :doc:`L2 cache `, per + :ref:`normalization unit `. + + - Requests per :ref:`normalization unit ` + + * - L1-L2 Atomics + + - The number of atomic requests that are sent through the vL1D to the + :doc:`L2 cache `, per + :ref:`normalization unit `. This includes requests + for atomics with, and without return. 
+ + - Requests per :ref:`normalization unit ` + + * - L1 Access Latency + + - Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + + - Cycles + + * - L1-L2 Read Access Latency + + - Calculated as the average number of cycles that the vL1D cache took to + issue and receive read requests from the :doc:`L2 Cache `. This + number also includes requests for atomics with return values. + + - Cycles + + * - L1-L2 Write Access Latency + + - Calculated as the average number of cycles that the vL1D cache took to + issue and receive acknowledgement of a write request to the + :doc:`L2 Cache `. This number also includes requests for + atomics without return values. + + - Cycles + +.. note:: + + All cache accesses in vL1D are for a single cache line's worth of data. + The size of a cache line may vary; however, on current AMD Instinct MI CDNA + accelerators and GCN™ GPUs the L1 cache line size is 64B. + +.. rubric :: Footnotes + +.. [#vl1d-hit] The vL1D cache on AMD Instinct MI-series CDNA accelerators + uses a "hit-on-miss" approach to reporting cache hits. That is, if while + satisfying a miss, another request comes in that would hit on the same + pending cache line, the subsequent request will be counted as a "hit". + Therefore, it is also important to consider the access latency metric in the + :ref:`Cache access metrics ` section when + evaluating the vL1D hit rate. + +.. [#vl1d-activity] Omniperf considers the vL1D to be active when any part of + the vL1D (excluding the :ref:`address processor ` and + :ref:`data return ` units) is active, for example, when performing + a translation, waiting for data, accessing the Tag or Cache RAMs, etc. + +.. _vl1d-l2-transaction-detail: + +vL1D - L2 Transaction Detail +---------------------------- + +This section provides a more granular look at the types of requests made +to the :doc:`L2 cache `. 
These are broken down by the operation type +(read / write / atomic, with, or without return), and the +:ref:`memory type `. + +.. _desc-td: + +Vector L1 data-return path or Texture Data (TD) +=============================================== + +The data-return path of the vL1D cache, also known as the Texture Data +(TD) unit, is responsible for routing data returned from the +:ref:`vL1D cache RAM ` back to a wavefront on a SIMD. As described in +the :ref:`vL1D cache front-end ` section, the data-return path is passed +information about the space requirements and routing for data requests +from the :ref:`VALU `. When data is returned from the +:ref:`vL1D cache RAM `, it is matched to this previously stored request +data, and returned to the appropriate SIMD. + +Omniperf reports the following vL1D data-return path metrics: + +.. list-table:: + :header-rows: 1 + + * - Metric + + - Description + + - Unit + + * - Data-return Busy + + - Percent of the :ref:`total CU cycles ` the data-return + unit was busy processing or waiting on data to return to the + :doc:`CU `. + + - Percent + + * - Cache RAM → Data-return Stall + + - Percent of the :ref:`total CU cycles ` the data-return + unit was stalled on data to be returned from the + :ref:`vL1D Cache RAM `. + + - Percent + + * - Workgroup manager → Data-return Stall + + - Percent of the :ref:`total CU cycles ` the data-return + unit was stalled by the :ref:`workgroup manager ` due to + initialization of registers as a part of launching new workgroups. + + - Percent + + * - Coalescable Instructions + + - The number of instructions submitted to the + :ref:`data-return unit ` by the + :ref:`address processor ` that were found to be coalescable, per + :ref:`normalization unit `. 
+ + - Instructions per :ref:`normalization unit ` + + * - Read Instructions + + - The number of read instructions submitted to the + :ref:`data-return unit ` by the + :ref:`address processor ` summed over all + :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. This is expected to be + the sum of global/generic and spill/stack reads in the + :ref:`address processor `. + + - Instructions per :ref:`normalization unit ` + + * - Write Instructions + + - The number of store instructions submitted to the + :ref:`data-return unit ` by the + :ref:`address processor ` summed over all + :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. This is expected to be + the sum of global/generic and spill/stack stores counted by the + :ref:`vL1D cache-front-end `. + + - Instructions per :ref:`normalization unit ` + + * - Atomic Instructions + + - The number of atomic instructions submitted to the + :ref:`data-return unit ` by the + :ref:`address processor ` summed over all + :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. This is expected to be + the sum of global/generic and spill/stack atomics in the + :ref:`address processor `. + + - Instructions per :ref:`normalization unit ` + diff --git a/projects/rocprofiler-compute/docs/conf.py b/projects/rocprofiler-compute/docs/conf.py new file mode 100644 index 0000000000..b38ce2e5cf --- /dev/null +++ b/projects/rocprofiler-compute/docs/conf.py @@ -0,0 +1,93 @@ +# MIT License + +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import re + +with open("../VERSION", encoding="utf-8") as f: + match = re.search(r"([0-9.]+)[^0-9.]+", f.read()) + if not match: + raise ValueError("VERSION not found!") + version_number = match[1] + +# project info +project = "Omniperf" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." 
+version = version_number +release = version_number + +extensions = ["rocm_docs", "sphinx.ext.extlinks", "sphinxcontrib.datatemplates"] +html_theme = "rocm_docs_theme" +html_theme_options = {"flavor": "rocm"} +html_title = f"{project} {version_number} documentation" +exclude_patterns = ["archive", "*/includes"] + +html_static_path = ["sphinx/static/css"] +html_css_files = ["o_custom.css"] + +external_toc_path = "./sphinx/_toc.yml" +external_projects_current_project = "omniperf" + +# frequently used external resources +extlinks = { + "dev-sample": ("https://github.com/ROCm/omniperf/blob/dev/sample/%s", "%s"), + "prod-page": ( + "https://www.amd.com/en/products/accelerators/instinct/%s.html", + "%s", + ), + "llvm-docs": ("https://llvm.org/docs/AMDGPUUsage.html#%s", "%s"), + "amd-lab-note": ("https://gpuopen.com/learn/amd-lab-notes/%s", "%s"), + "cdna2-white-paper": ( + "https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf#page=%s", + "CDNA2 white paper (page %s)", + ), + "gcn-crash-course": ( + "https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah#%s", + "The AMD GCN Architecture - A Crash Course (slide %s)", + ), + "hip-training-pdf": ( + "https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf#page=%s", + "Introduction to AMD GPU Programming with HIP (slide %s)", + ), + "mantor-gcn-pdf": ( + "https://old.hotchips.org/wp-content/uploads/hc_archives/hc24/HC24-3-ManyCore/HC24.28.315-AMD.GCN.mantor_v1.pdf#page=%s", + "AMD Radeon HD7970 with GCN Architecture (slide %s)", + ), + "mantor-vega10-pdf": ( + "https://old.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.120-Radeon-Vega10-Mantor-AMD-f1.pdf#page=%s", + "AMD Radeon Next Generation GPU Architecture - Vega10 (slide %s)", + ), + "mi200-isa-pdf": ( + "https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf#page=%s", + "AMD Instinct 
MI200 ISA Reference Guide (page %s)", + ), + "hsa-runtime-pdf": ( + "http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf#page=%s", + "HSA Runtime Programmer's Reference Manual (page %s)", + ), +} diff --git a/projects/rocprofiler-compute/docs/data/analyze/global_variables.png b/projects/rocprofiler-compute/docs/data/analyze/global_variables.png new file mode 100644 index 0000000000..87f49b5e14 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/global_variables.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/Current_and_baseline_dispatch_ids.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/Current_and_baseline_dispatch_ids.png new file mode 100644 index 0000000000..811bf99692 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/Current_and_baseline_dispatch_ids.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/Kernel_time_histogram.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/Kernel_time_histogram.png new file mode 100644 index 0000000000..8ec0fd83ba Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/Kernel_time_histogram.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/Top_bottleneck_dispatches.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/Top_bottleneck_dispatches.png new file mode 100644 index 0000000000..31d13a0a2f Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/Top_bottleneck_dispatches.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cpc_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cpc_panel.png new file mode 100644 index 0000000000..7b7f758588 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cpc_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cpf_panel.png 
b/projects/rocprofiler-compute/docs/data/analyze/grafana/cpf_panel.png new file mode 100644 index 0000000000..a43b878536 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cpf_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-arith-ops_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-arith-ops_panel.png new file mode 100644 index 0000000000..073b64d707 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-arith-ops_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-inst-mix_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-inst-mix_panel.png new file mode 100644 index 0000000000..1b9a6d2b25 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-inst-mix_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-mafma-arith-instr-mix_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-mafma-arith-instr-mix_panel.png new file mode 100644 index 0000000000..d74dfd271a Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-mafma-arith-instr-mix_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-pipeline-stats_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-pipeline-stats_panel.png new file mode 100644 index 0000000000..6f572f9148 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-pipeline-stats_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-sol_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-sol_panel.png new file mode 100644 index 0000000000..8e8f46174f Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-sol_panel.png differ diff --git 
a/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-value-arith-instr-mix_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-value-arith-instr-mix_panel.png new file mode 100644 index 0000000000..de3750d2d0 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-value-arith-instr-mix_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-vmem-instr-mix_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-vmem-instr-mix_panel.png new file mode 100644 index 0000000000..1d6ce1bc46 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/cu-vmem-instr-mix_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/instr-cache-accesses_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/instr-cache-accesses_panel.png new file mode 100644 index 0000000000..926a7805e7 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/instr-cache-accesses_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/instr-cache-sol_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/instr-cache-sol_panel.png new file mode 100644 index 0000000000..64be7178c6 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/instr-cache-sol_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-accesses_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-accesses_panel.png new file mode 100644 index 0000000000..101cf77530 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-accesses_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-fabric-interface-stalls_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-fabric-interface-stalls_panel.png new file mode 100644 index 0000000000..b1bd415ca3 
Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-fabric-interface-stalls_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-fabric-transactions_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-fabric-transactions_panel.png new file mode 100644 index 0000000000..7df5a78095 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-fabric-transactions_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-per-channel-agg-stats_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-per-channel-agg-stats_panel.png new file mode 100644 index 0000000000..704d45c69f Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-per-channel-agg-stats_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-sol_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-sol_panel.png new file mode 100644 index 0000000000..646e608cbc Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/l2-sol_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/lds-sol_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/lds-sol_panel.png new file mode 100644 index 0000000000..c261513aa9 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/lds-sol_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/lds-stats_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/lds-stats_panel.png new file mode 100644 index 0000000000..0d9d419eb7 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/lds-stats_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/memory-chart_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/memory-chart_panel.png 
new file mode 100644 index 0000000000..1091a50329 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/memory-chart_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/roofline_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/roofline_panel.png new file mode 100644 index 0000000000..47ee9bddb1 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/roofline_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-cache-accesses_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-cache-accesses_panel.png new file mode 100644 index 0000000000..3605cce8a2 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-cache-accesses_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-l12-interface_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-l12-interface_panel.png new file mode 100644 index 0000000000..5c3480ac9f Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-l12-interface_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-sol_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-sol_panel.png new file mode 100644 index 0000000000..92fa5a1a4a Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/sl1d-sol_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/sol_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/sol_panel.png new file mode 100644 index 0000000000..f456500e02 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/sol_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/spi-resource-allocation_panel.png 
b/projects/rocprofiler-compute/docs/data/analyze/grafana/spi-resource-allocation_panel.png new file mode 100644 index 0000000000..bee869ad10 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/spi-resource-allocation_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/spi-stats_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/spi-stats_panel.png new file mode 100644 index 0000000000..19c7ad3645 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/spi-stats_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/system-info_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/system-info_panel.png new file mode 100644 index 0000000000..5a5fa01187 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/system-info_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/ta_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/ta_panel.png new file mode 100644 index 0000000000..2f08f9a6b1 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/ta_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/td_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/td_panel.png new file mode 100644 index 0000000000..819407515b Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/td_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/top-stat_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/top-stat_panel.png new file mode 100644 index 0000000000..5e3dddca2f Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/top-stat_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-addr-translation_panel.png 
b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-addr-translation_panel.png new file mode 100644 index 0000000000..0fb4aaf076 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-addr-translation_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-cache-accesses_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-cache-accesses_panel.png new file mode 100644 index 0000000000..5259b2214f Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-cache-accesses_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-cache-stalls_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-cache-stalls_panel.png new file mode 100644 index 0000000000..61e09c915c Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-cache-stalls_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-l2-transactions_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-l2-transactions_panel.png new file mode 100644 index 0000000000..51875e516c Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-l2-transactions_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-sol_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-sol_panel.png new file mode 100644 index 0000000000..5c2485d0d7 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/vl1d-sol_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/wavefront-launch-stats_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/wavefront-launch-stats_panel.png new file mode 100644 index 0000000000..38e4517f33 Binary files /dev/null and 
b/projects/rocprofiler-compute/docs/data/analyze/grafana/wavefront-launch-stats_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/grafana/wavefront-runtime-stats_panel.png b/projects/rocprofiler-compute/docs/data/analyze/grafana/wavefront-runtime-stats_panel.png new file mode 100644 index 0000000000..517d461d31 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/grafana/wavefront-runtime-stats_panel.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/standalone_gui.png b/projects/rocprofiler-compute/docs/data/analyze/standalone_gui.png new file mode 100644 index 0000000000..a8abd81694 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/analyze/standalone_gui.png differ diff --git a/projects/rocprofiler-compute/docs/data/faq/tunnel_demo1.png b/projects/rocprofiler-compute/docs/data/faq/tunnel_demo1.png new file mode 100644 index 0000000000..bda64883c4 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/faq/tunnel_demo1.png differ diff --git a/projects/rocprofiler-compute/docs/data/faq/tunnel_demo2.png b/projects/rocprofiler-compute/docs/data/faq/tunnel_demo2.png new file mode 100644 index 0000000000..8b2d258521 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/faq/tunnel_demo2.png differ diff --git a/projects/rocprofiler-compute/docs/data/faq/tunnel_demo3.png b/projects/rocprofiler-compute/docs/data/faq/tunnel_demo3.png new file mode 100644 index 0000000000..76cd7ed9a9 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/faq/tunnel_demo3.png differ diff --git a/projects/rocprofiler-compute/docs/data/install/datasource_config.jpg b/projects/rocprofiler-compute/docs/data/install/datasource_config.jpg new file mode 100644 index 0000000000..4210d9036b Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/datasource_config.jpg differ diff --git a/projects/rocprofiler-compute/docs/data/install/datasource_settings.jpg 
b/projects/rocprofiler-compute/docs/data/install/datasource_settings.jpg new file mode 100644 index 0000000000..f472362544 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/datasource_settings.jpg differ diff --git a/projects/rocprofiler-compute/docs/data/install/grafana_welcome.png b/projects/rocprofiler-compute/docs/data/install/grafana_welcome.png new file mode 100644 index 0000000000..e564c0a389 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/grafana_welcome.png differ diff --git a/projects/rocprofiler-compute/docs/data/install/grafana_workload_selection.png b/projects/rocprofiler-compute/docs/data/install/grafana_workload_selection.png new file mode 100644 index 0000000000..3ecdc35e72 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/grafana_workload_selection.png differ diff --git a/projects/rocprofiler-compute/docs/data/install/import_dashboard.png b/projects/rocprofiler-compute/docs/data/install/import_dashboard.png new file mode 100644 index 0000000000..29be7ea584 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/import_dashboard.png differ diff --git a/projects/rocprofiler-compute/docs/data/install/install-decision-tree.png b/projects/rocprofiler-compute/docs/data/install/install-decision-tree.png new file mode 100644 index 0000000000..1c62fba87b Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/install-decision-tree.png differ diff --git a/projects/rocprofiler-compute/docs/data/install/omniperf_server_vs_client_install.png b/projects/rocprofiler-compute/docs/data/install/omniperf_server_vs_client_install.png new file mode 100644 index 0000000000..8c43dba9e2 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/omniperf_server_vs_client_install.png differ diff --git a/projects/rocprofiler-compute/docs/data/install/opening_dashboard.png 
b/projects/rocprofiler-compute/docs/data/install/opening_dashboard.png new file mode 100644 index 0000000000..5e6c7ea625 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/install/opening_dashboard.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/fabric.png b/projects/rocprofiler-compute/docs/data/performance-model/fabric.png new file mode 100644 index 0000000000..826b4d9de7 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/performance-model/fabric.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/fabric.svg b/projects/rocprofiler-compute/docs/data/performance-model/fabric.svg new file mode 100644 index 0000000000..516854843a --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/performance-model/fabric.svg @@ -0,0 +1,899 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Total Fabric Requests + + + + + 32B Read Requests + + + + 64B Read Requests + + + + 32B Write Requests + + + + + + 64B Write Requests + + + + + + Uncached Read Requests + + + x2 + + + + Uncached Write Requests + + + + + + Atomic +Requests + + + + + + HBM Read +Requests + + + + + Remote Read +Requests + + + + + + + + + + + + + + + + + + + HBM Write Requests + + + + Remote Write Requests + + + + diff --git a/projects/rocprofiler-compute/docs/data/performance-model/gcn_compute_unit.png b/projects/rocprofiler-compute/docs/data/performance-model/gcn_compute_unit.png new file mode 100644 index 0000000000..e6c1f2eb07 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/performance-model/gcn_compute_unit.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/l1perf_model.png b/projects/rocprofiler-compute/docs/data/performance-model/l1perf_model.png new file mode 100644 index 0000000000..fdabfbb955 Binary files /dev/null and 
b/projects/rocprofiler-compute/docs/data/performance-model/l1perf_model.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/l1perf_model.svg b/projects/rocprofiler-compute/docs/data/performance-model/l1perf_model.svg new file mode 100644 index 0000000000..dd22a71319 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/performance-model/l1perf_model.svg @@ -0,0 +1,584 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Compute Unit + Cmd/Data + + + + Address Processing Unit + + + Sync + Data Processing Unit + + Virtual To Physical Address Translation + + Tag RAM + + L1 Cache Controller + + CacheRAM + + L2 Memory Interface + Data + + Bus + + L2 Cache + + + + diff --git a/projects/rocprofiler-compute/docs/data/performance-model/lds.png b/projects/rocprofiler-compute/docs/data/performance-model/lds.png new file mode 100644 index 0000000000..f444eaf539 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/performance-model/lds.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/lds.svg b/projects/rocprofiler-compute/docs/data/performance-model/lds.svg new file mode 100644 index 0000000000..c0adb5e912 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/performance-model/lds.svg @@ -0,0 +1,393 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SIMD 0/1 + SIMD 2/3 + + + + + + Conflict Detection + + + + Scheduler + + + + Bank 0 + + + + Bank 1 + + + + Bank 2 + + + + Bank 3 + + + + Bank 31 + + ... 
+ + diff --git a/projects/rocprofiler-compute/docs/data/performance-model/nosplit.png b/projects/rocprofiler-compute/docs/data/performance-model/nosplit.png new file mode 100644 index 0000000000..a8e5f01649 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/performance-model/nosplit.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/nosplit.svg b/projects/rocprofiler-compute/docs/data/performance-model/nosplit.svg new file mode 100644 index 0000000000..d0d9606be5 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/performance-model/nosplit.svg @@ -0,0 +1,71 @@ + + + + + + + + + + + + diff --git a/projects/rocprofiler-compute/docs/data/performance-model/selayout.png b/projects/rocprofiler-compute/docs/data/performance-model/selayout.png new file mode 100644 index 0000000000..73aa2b49de Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/performance-model/selayout.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/split.png b/projects/rocprofiler-compute/docs/data/performance-model/split.png new file mode 100644 index 0000000000..cca71eb2a4 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/performance-model/split.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/split.svg b/projects/rocprofiler-compute/docs/data/performance-model/split.svg new file mode 100644 index 0000000000..b033a9e111 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/performance-model/split.svg @@ -0,0 +1,64 @@ + + + + + + + + + + + diff --git a/projects/rocprofiler-compute/docs/data/performance-model/uncached.png b/projects/rocprofiler-compute/docs/data/performance-model/uncached.png new file mode 100644 index 0000000000..f770a1b291 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/performance-model/uncached.png differ diff --git a/projects/rocprofiler-compute/docs/data/performance-model/uncached.svg 
b/projects/rocprofiler-compute/docs/data/performance-model/uncached.svg new file mode 100644 index 0000000000..53affd4fc6 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/performance-model/uncached.svg @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + x2 + + diff --git a/projects/rocprofiler-compute/docs/data/profile/sample-roof-plot.png b/projects/rocprofiler-compute/docs/data/profile/sample-roof-plot.png new file mode 100644 index 0000000000..2deaba7ad2 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/profile/sample-roof-plot.png differ diff --git a/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsbandwidth.png b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsbandwidth.png new file mode 100644 index 0000000000..bd74d62499 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsbandwidth.png differ diff --git a/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsbandwidth.svg b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsbandwidth.svg new file mode 100644 index 0000000000..a854f697de --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsbandwidth.svg @@ -0,0 +1,1579 @@ + + + + + + + + 2023-08-21T11:00:20.650499 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflictrate.png b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflictrate.png new file mode 100644 index 0000000000..ab057f3cd9 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflictrate.png differ diff --git a/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflictrate.svg b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflictrate.svg new file mode 100644 index 0000000000..f98e9bc4a6 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflictrate.svg @@ -0,0 +1,1050 @@ + + + + + + + + 2023-08-21T11:43:04.336525 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflicts.png b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflicts.png new file mode 100644 index 0000000000..77c0938581 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflicts.png differ diff --git a/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflicts.svg b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflicts.svg new file mode 100644 index 0000000000..f4a2f17d14 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/profiling-by-example/ldsconflicts.svg @@ -0,0 +1,1145 @@ + + + + + + + + 2023-08-17T18:14:36.907658 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/rocprofiler-compute/docs/data/unused/L1_l2_transactions_per_channel.png b/projects/rocprofiler-compute/docs/data/unused/L1_l2_transactions_per_channel.png new file mode 100644 index 0000000000..7b839ab0fe Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/L1_l2_transactions_per_channel.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/L2_ea_latencies_per_channel.png b/projects/rocprofiler-compute/docs/data/unused/L2_ea_latencies_per_channel.png new file mode 100644 index 0000000000..a0b3471974 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/L2_ea_latencies_per_channel.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/L2_ea_stalls_per_channel.png b/projects/rocprofiler-compute/docs/data/unused/L2_ea_stalls_per_channel.png new file mode 100644 index 0000000000..ac1c5dffb1 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/L2_ea_stalls_per_channel.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/L2_ea_write_stalls_per_channel.png b/projects/rocprofiler-compute/docs/data/unused/L2_ea_write_stalls_per_channel.png new file mode 100644 index 0000000000..d5a1c2c072 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/L2_ea_write_stalls_per_channel.png differ diff --git 
a/projects/rocprofiler-compute/docs/data/unused/L2_ea_write_starvation_per_channel.png b/projects/rocprofiler-compute/docs/data/unused/L2_ea_write_starvation_per_channel.png new file mode 100644 index 0000000000..49d584621d Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/L2_ea_write_starvation_per_channel.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/Memory_latencies.png b/projects/rocprofiler-compute/docs/data/unused/Memory_latencies.png new file mode 100644 index 0000000000..3b97d72e0d Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/Memory_latencies.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/Roofline_analysis.png b/projects/rocprofiler-compute/docs/data/unused/Roofline_analysis.png new file mode 100644 index 0000000000..36efd2ea77 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/Roofline_analysis.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/Top_bottleneck_kernels.png b/projects/rocprofiler-compute/docs/data/unused/Top_bottleneck_kernels.png new file mode 100644 index 0000000000..17b8ef7da2 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/Top_bottleneck_kernels.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/fig_level_counter.png b/projects/rocprofiler-compute/docs/data/unused/fig_level_counter.png new file mode 100644 index 0000000000..fa50539a0c Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/fig_level_counter.png differ diff --git a/projects/rocprofiler-compute/docs/data/unused/omniperf_architecture.png b/projects/rocprofiler-compute/docs/data/unused/omniperf_architecture.png new file mode 100644 index 0000000000..966ac2d608 Binary files /dev/null and b/projects/rocprofiler-compute/docs/data/unused/omniperf_architecture.png differ diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst 
b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst new file mode 100644 index 0000000000..f76e3970fc --- /dev/null +++ b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst @@ -0,0 +1,378 @@ +.. meta:: + :description: Omniperf analysis: CLI analysis + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, command line, analyze, filtering, metrics, baseline, comparison + +************ +CLI analysis +************ + +This section provides an overview of Omniperf's CLI analysis features. + +* :ref:`Derived metrics <cli-list-metrics>`: All of Omniperf's built-in metrics. + +* :ref:`Baseline comparison `: Compare multiple + runs in a side-by-side manner. + +* :ref:`Metric customization `: Isolate a subset of + built-in metrics or build your own profiling configuration. + +* :ref:`Filtering `: Home in on a particular kernel, + GPU ID, or dispatch ID via post-process filtering. + +Run ``omniperf analyze -h`` for more details. + +.. _cli-walkthrough: + +Walkthrough +=========== + +1. To begin, generate a high-level analysis report using Omniperf's ``-b`` (or ``--block``) flag. + + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ -b 2 + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + Analysis mode = cli + [analysis] deriving Omniperf metrics... + + -------------------------------------------------------------------------------- + 0. 
Top Stats + 0.1 Top Kernels + ╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ Kernel_Name │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ vecCopy(double*, double*, double*, int, │ 1.00 │ 20160.00 │ 20160.00 │ 20160.00 │ 100.00 │ + │ │ int) [clone .kd] │ │ │ │ │ │ + ╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + 0.2 Dispatch List + ╒════╤═══════════════╤══════════════════════════════════════════════════════════╤══════════╕ + │ │ Dispatch_ID │ Kernel_Name │ GPU_ID │ + ╞════╪═══════════════╪══════════════════════════════════════════════════════════╪══════════╡ + │ 0 │ 0 │ vecCopy(double*, double*, double*, int, int) [clone .kd] │ 0 │ + ╘════╧═══════════════╧══════════════════════════════════════════════════════════╧══════════╛ + + + -------------------------------------------------------------------------------- + 2. 
System Speed-of-Light + 2.1 Speed-of-Light + ╒═════════════╤═══════════════════════════╤═════════╤══════════════════╤══════════╤═══════════════╕ + │ Metric_ID │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ + ╞═════════════╪═══════════════════════════╪═════════╪══════════════════╪══════════╪═══════════════╡ + │ 2.1.0 │ VALU FLOPs │ 0.0 │ Gflop │ 22630.4 │ 0.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.1 │ VALU IOPs │ 364.09 │ Giop │ 22630.4 │ 1.61 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.2 │ MFMA FLOPs (BF16) │ 0.0 │ Gflop │ 181043.2 │ 0.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.3 │ MFMA FLOPs (F16) │ 0.0 │ Gflop │ 181043.2 │ 0.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.4 │ MFMA FLOPs (F32) │ 0.0 │ Gflop │ 45260.8 │ 0.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.5 │ MFMA FLOPs (F64) │ 0.0 │ Gflop │ 45260.8 │ 0.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.6 │ MFMA IOPs (Int8) │ 0.0 │ Giop │ 181043.2 │ 0.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.7 │ Active CUs │ 70.0 │ Cus │ 104.0 │ 67.31 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.8 │ SALU Utilization │ 3.78 │ Pct │ 100.0 │ 3.78 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.9 │ VALU Utilization │ 5.4 │ Pct │ 100.0 │ 5.4 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.10 │ MFMA Utilization │ 0.0 │ Pct │ 100.0 │ 0.0 │ + 
├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.11 │ VMEM Utilization │ 1.08 │ Pct │ 100.0 │ 1.08 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.12 │ Branch Utilization │ 1.08 │ Pct │ 100.0 │ 1.08 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.13 │ VALU Active Threads │ 64.0 │ Threads │ 64.0 │ 100.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.14 │ IPC │ 0.21 │ Instr/cycle │ 5.0 │ 4.13 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.15 │ Wavefront Occupancy │ 2488.86 │ Wavefronts │ 3328.0 │ 74.79 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.16 │ Theoretical LDS Bandwidth │ 0.0 │ Gb/s │ 22630.4 │ 0.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.17 │ LDS Bank Conflicts/Access │ │ Conflicts/access │ 32.0 │ │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.18 │ vL1D Cache Hit Rate │ 50.0 │ Pct │ 100.0 │ 50.0 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.19 │ vL1D Cache BW │ 1664.41 │ Gb/s │ 11315.2 │ 14.71 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.20 │ L2 Cache Hit Rate │ 35.74 │ Pct │ 100.0 │ 35.74 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.21 │ L2 Cache BW │ 1296.31 │ Gb/s │ 3481.6 │ 37.23 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.22 │ L2-Fabric Read BW │ 416.52 │ Gb/s │ 1638.4 │ 25.42 │ + 
├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.23 │ L2-Fabric Write BW │ 292.3 │ Gb/s │ 1638.4 │ 17.84 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.24 │ L2-Fabric Read Latency │ 262.85 │ Cycles │ │ │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.25 │ L2-Fabric Write Latency │ 307.4 │ Cycles │ │ │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.26 │ sL1D Cache Hit Rate │ 99.82 │ Pct │ 100.0 │ 99.82 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.27 │ sL1D Cache BW │ 208.05 │ Gb/s │ 6092.8 │ 3.41 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.28 │ L1I Hit Rate │ 99.91 │ Pct │ 100.0 │ 99.91 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.29 │ L1I BW │ 208.05 │ Gb/s │ 6092.8 │ 3.41 │ + ├─────────────┼───────────────────────────┼─────────┼──────────────────┼──────────┼───────────────┤ + │ 2.1.30 │ L1I Fetch Latency │ 20.86 │ Cycles │ │ │ + ╘═════════════╧═══════════════════════════╧═════════╧══════════════════╧══════════╧═══════════════╛ + + ... + +.. _cli-list-metrics: + +2. Use ``--list-metrics`` to generate a list of available metrics for inspection. + + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ --list-metrics gfx90a + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + Analysis mode = cli + [analysis] deriving Omniperf metrics... 
+ 0 -> Top Stats + 1 -> System Info + 2 -> System Speed-of-Light + 2.1 -> Speed-of-Light + 2.1.0 -> VALU FLOPs + 2.1.1 -> VALU IOPs + 2.1.2 -> MFMA FLOPs (BF16) + 2.1.3 -> MFMA FLOPs (F16) + 2.1.4 -> MFMA FLOPs (F32) + 2.1.5 -> MFMA FLOPs (F64) + 2.1.6 -> MFMA IOPs (Int8) + 2.1.7 -> Active CUs + 2.1.8 -> SALU Utilization + 2.1.9 -> VALU Utilization + 2.1.10 -> MFMA Utilization + 2.1.11 -> VMEM Utilization + 2.1.12 -> Branch Utilization + 2.1.13 -> VALU Active Threads + 2.1.14 -> IPC + 2.1.15 -> Wavefront Occupancy + 2.1.16 -> Theoretical LDS Bandwidth + 2.1.17 -> LDS Bank Conflicts/Access + 2.1.18 -> vL1D Cache Hit Rate + 2.1.19 -> vL1D Cache BW + 2.1.20 -> L2 Cache Hit Rate + 2.1.21 -> L2 Cache BW + 2.1.22 -> L2-Fabric Read BW + 2.1.23 -> L2-Fabric Write BW + 2.1.24 -> L2-Fabric Read Latency + 2.1.25 -> L2-Fabric Write Latency + 2.1.26 -> sL1D Cache Hit Rate + 2.1.27 -> sL1D Cache BW + 2.1.28 -> L1I Hit Rate + 2.1.29 -> L1I BW + 2.1.30 -> L1I Fetch Latency + ... + +3. Choose your own customized subset of metrics with the ``-b`` (or ``--block``) + option. Or, build your own configuration following + `config_template `_. + The following snippet shows how to generate a report containing only metric 2 + (:doc:`System Speed-of-Light `). + + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ -b 2 + + -------- + Analyze + -------- + + -------------------------------------------------------------------------------- + 0. 
Top Stat + ╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ vecCopy(double*, double*, double*, int, │ 1 │ 20000.00 │ 20000.00 │ 20000.00 │ 100.00 │ + │ │ int) [clone .kd] │ │ │ │ │ │ + ╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 2. System Speed-of-Light + ╒═════════╤═══════════════════════════╤═══════════════════════╤══════════════════╤════════════════════╤════════════════════════╕ + │ Index │ Metric │ Value │ Unit │ Peak │ PoP │ + ╞═════════╪═══════════════════════════╪═══════════════════════╪══════════════════╪════════════════════╪════════════════════════╡ + │ 2.1.0 │ VALU FLOPs │ 0.0 │ Gflop │ 22630.4 │ 0.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.1 │ VALU IOPs │ 367.0016 │ Giop │ 22630.4 │ 1.6217194570135745 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.2 │ MFMA FLOPs (BF16) │ 0.0 │ Gflop │ 90521.6 │ 0.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.3 │ MFMA FLOPs (F16) │ 0.0 │ Gflop │ 181043.2 │ 0.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.4 │ MFMA FLOPs (F32) │ 0.0 │ Gflop │ 45260.8 │ 0.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.5 │ MFMA FLOPs (F64) │ 0.0 │ Gflop │ 45260.8 │ 0.0 │ + 
├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.6 │ MFMA IOPs (Int8) │ 0.0 │ Giop │ 181043.2 │ 0.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.7 │ Active CUs │ 74 │ Cus │ 104 │ 71.15384615384616 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.8 │ SALU Util │ 4.016057506716307 │ Pct │ 100 │ 4.016057506716307 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.9 │ VALU Util │ 5.737225009594725 │ Pct │ 100 │ 5.737225009594725 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.10 │ MFMA Util │ 0.0 │ Pct │ 100 │ 0.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.11 │ VALU Active Threads/Wave │ 64.0 │ Threads │ 64 │ 100.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.12 │ IPC - Issue │ 1.0 │ Instr/cycle │ 5 │ 20.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.13 │ LDS BW │ 0.0 │ Gb/sec │ 22630.4 │ 0.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.14 │ LDS Bank Conflict │ │ Conflicts/access │ 32 │ │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.15 │ Instr Cache Hit Rate │ 99.91306912556854 │ Pct │ 100 │ 99.91306912556854 │ + 
├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.16 │ Instr Cache BW │ 209.7152 │ Gb/s │ 6092.8 │ 3.442016806722689 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.17 │ Scalar L1D Cache Hit Rate │ 99.81986908342313 │ Pct │ 100 │ 99.81986908342313 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.18 │ Scalar L1D Cache BW │ 209.7152 │ Gb/s │ 6092.8 │ 3.442016806722689 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.19 │ Vector L1D Cache Hit Rate │ 50.0 │ Pct │ 100 │ 50.0 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.20 │ Vector L1D Cache BW │ 1677.7216 │ Gb/s │ 11315.199999999999 │ 14.82714932126697 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.21 │ L2 Cache Hit Rate │ 35.55067615693325 │ Pct │ 100 │ 35.55067615693325 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.22 │ L2-Fabric Read BW │ 419.8496 │ Gb/s │ 1638.4 │ 25.6255859375 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.23 │ L2-Fabric Write BW │ 293.9456 │ Gb/s │ 1638.4 │ 17.941015625 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.24 │ L2-Fabric Read Latency │ 256.6482321288385 │ Cycles │ │ │ + 
├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.25 │ L2-Fabric Write Latency │ 317.2264255699014 │ Cycles │ │ │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.26 │ Wave Occupancy │ 1821.723057333852 │ Wavefronts │ 3328 │ 54.73927455931046 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.27 │ Instr Fetch BW │ 4.174722306564298e-08 │ Gb/s │ 3046.4 │ 1.3703789084047721e-09 │ + ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤ + │ 2.1.28 │ Instr Fetch Latency │ 21.729248046875 │ Cycles │ │ │ + ╘═════════╧═══════════════════════════╧═══════════════════════╧══════════════════╧════════════════════╧════════════════════════╛ + + .. note:: + + Some cells may be blank indicating a missing or unavailable hardware + counter or NULL value. + +4. Optimize the application, iterate, and re-profile to inspect performance + changes. + +5. Redo a comprehensive analysis with Omniperf CLI at any optimization + milestone. + +.. _cli-analysis-options: + +More analysis options +===================== + +Single run + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ + +List top kernels and dispatches + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ --list-stats + +List metrics + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ --list-metrics gfx90a + +Show System Speed-of-Light and CS_Busy blocks only + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ -b 2 5.1.0 + +.. note:: + + You can filter a single metric or the whole hardware component by its ID. In + this case, ``1`` is the ID for System Speed-of-Light and ``5.1.0`` the ID for + GPU Busy Cycles metric. 
+ +Filter kernels + First, list the top kernels in your application using `--list-stats`. + + .. code-block:: + + $ omniperf analyze -p workloads/vcopy/MI200/ --list-stats + + Analysis mode = cli + [analysis] deriving Omniperf metrics... + + -------------------------------------------------------------------------------- + Detected Kernels (sorted descending by duration) + ╒════╤══════════════════════════════════════════════╕ + │ │ Kernel_Name │ + ╞════╪══════════════════════════════════════════════╡ + │ 0 │ vecCopy(double*, double*, double*, int, int) │ + ╘════╧══════════════════════════════════════════════╛ + + -------------------------------------------------------------------------------- + Dispatch list + ╒════╤═══════════════╤══════════════════════════════════════════════╤══════════╕ + │ │ Dispatch_ID │ Kernel_Name │ GPU_ID │ + ╞════╪═══════════════╪══════════════════════════════════════════════╪══════════╡ + │ 0 │ 0 │ vecCopy(double*, double*, double*, int, int) │ 0 │ + ╘════╧═══════════════╧══════════════════════════════════════════════╧══════════╛ + + Second, select the index of the kernel you would like to filter; for example, + ``vecCopy(double*, double*, double*, int, int) [clone .kd]`` at index ``0``. + Then, use this index to apply the filter via ``-k`` or ``--kernels``. + + .. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ -k 0 + + Analysis mode = cli + [analysis] deriving Omniperf metrics... + + -------------------------------------------------------------------------------- + 0. 
Top Stats + 0.1 Top Kernels + ╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╤═════╕ + │ │ Kernel_Name │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ S │ + ╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╪═════╡ + │ 0 │ vecCopy(double*, double*, double*, int, │ 1.00 │ 18560.00 │ 18560.00 │ 18560.00 │ 100.00 │ * │ + │ │ int) │ │ │ │ │ │ │ + ╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╧═════╛ + ... + + You should see your filtered kernels indicated by an asterisk in the **Top + Stats** table. + + +Baseline comparison + .. code-block:: shell + + omniperf analyze -p workload1/path/ -p workload2/path/ + + OR + + .. code-block:: shell + + omniperf analyze -p workload1/path/ -k 0 -p workload2/path/ -k 1 diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/grafana-gui.rst b/projects/rocprofiler-compute/docs/how-to/analyze/grafana-gui.rst new file mode 100644 index 0000000000..d5474aefbd --- /dev/null +++ b/projects/rocprofiler-compute/docs/how-to/analyze/grafana-gui.rst @@ -0,0 +1,1071 @@ +.. meta:: + :description: Omniperf analysis: Grafana GUI + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, Grafana, panels, GUI, import + +******************** +Grafana GUI analysis +******************** + +Find setup instructions in :doc:`../../install/grafana-setup`. 
+ +The Omniperf Grafana analysis dashboard GUI supports the following features to +facilitate MI accelerator performance profiling and analysis: + +* System and hardware component (hardware block) + +* Speed-of-Light (SOL) + +* Multiple normalization options + +* Baseline comparisons + +* Regex-based dispatch ID filtering + +* Roofline analysis + +* Detailed performance counters and metrics per hardware component, such as: + + * Command Processor - Fetch (CPF) / Command Processor - Controller (CPC) + + * Workgroup Manager (SPI) + + * Shader Sequencer (SQ) + + * Shader Sequencer Controller (SQC) + + * L1 Address Processing Unit, a.k.a. Texture Addresser (TA) / L1 Backend Data + Processing Unit, a.k.a. Texture Data (TD) + + * L1 Cache (TCP) + + * L2 Cache (TCC) (both aggregated and per-channel perf info) + +See the full list of :ref:`Omniperf's analysis panels `. + +.. _analysis-sol: + +Speed-of-Light +-------------- + +Speed-of-Light panels are provided at both the system and per hardware component +level to help diagnosis performance bottlenecks. The performance numbers of the +workload under testing are compared to the theoretical maximum, such as floating +point operations, bandwidth, cache hit rate, etc., to indicate the available +room to further utilize the hardware capability. + +.. _analysis-normalizations: + +Normalizations +-------------- + +Multiple performance number normalizations are provided to allow performance +inspection within both hardware and software context. The following +normalizations are available. + +* ``per_wave`` + +* ``per_cycle`` + +* ``per_kernel`` + +* ``per_second`` + +See :ref:`normalization-units` to learn more about Omniperf normalizations. + +.. _analysis-baseline-comparison: + +Baseline comparison +------------------- + +Omniperf enables baseline comparison to allow checking A/B effect. Currently +baseline comparison is limited to the same :ref:`SoC `. Cross +comparison between SoCs is in development. 
+ +For both the Current Workload and the Baseline Workload, you can independently +setup the following filters to allow fine grained comparisons: + +* Workload Name + +* GPU ID filtering (multi-selection) + +* Kernel Name filtering (multi-selection) + +* Dispatch ID filtering (regex filtering) + +* Omniperf Panels (multi-selection) + +.. _analysis-regex-dispatch-id: + +Regex-based dispatch ID filtering +--------------------------------- + +Omniperf allows filtering via Regular Expressions (regex), a standard Linux +string matching syntax, based dispatch ID filtering to flexibly choose the +kernel invocations. + +For example, to inspect Dispatch Range from 17 to 48, inclusive, the +corresponding regex is : ``(1[7-9]|[23]\d|4[0-8])``. + +.. tip:: + + Try `Regex Numeric Range Generator `_ for help + generating typical number ranges. + +.. _analysis-incremental-profiling: + +Incremental profiling +--------------------- + +Omniperf supports incremental profiling to speed up performance analysis. + +Refer to the :ref:`profiling-hw-component-filtering` section for this command. + +By default, the entire application is profiled to collect performance counters +for all hardware blocks, giving a complete view of where the workload stands in +terms of performance optimization opportunities and bottlenecks. + +You can choose to focus on only a few hardware components -- for example L1 +cache or LDS -- to closely check the effect of software optimizations, without +performing application replay for *all* other hardware components. This saves +a lot of compute time. In addition, prior profiling results for other hardware +components are not overwritten; instead, they can be merged during the import to +piece together an overall profile of the system. + +.. _analysis-color-coding: + +Color coding +------------ + +Uniform color coding applies to most visualizations -- including bar graphs, +tables, and diagrams -- for easy inspection. 
As a rule of thumb, *yellow* means +over 50%, while *red* means over 90%.
+ + Interaction Type: + -i, --import Import workload to Omniperf DB + -r, --remove Remove a workload from Omniperf DB + + Connection Options: + -H , --host Name or IP address of the server host. + -P , --port TCP/IP Port. (DEFAULT: 27018) + -u , --username Username for authentication. + -p , --password The user's password. (will be requested later if it's not set) + -t , --team Specify Team prefix. + -w , --workload Specify name of workload (to remove) or path to workload (to import) + --kernel-verbose Specify Kernel Name verbose level 1-5. Lower the level, shorter the kernel name. (DEFAULT: 5) (DISABLE: 5) + + +Omniperf import for vcopy: +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + $ omniperf database --import -H dummybox -u temp -t asw -w workloads/vcopy/mi200/ + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + + Pulling data from /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + The directory exists + Found sysinfo file + KernelName shortening enabled + Kernel name verbose level: 2 + Password: + Password received + -- Conversion & Upload in Progress -- + 0%| | 0/11 [00:00` + + - Kernel time histogram + + - Top ten bottleneck kernels + +- :ref:`System Speed-of-Light ` + + - Speed-of-Light + + - System Info table + +- :ref:`Memory Chart Analysis ` + +- :ref:`Roofline Analysis ` + + - FP32/FP64 + + - FP16/INT8 + +- :ref:`Command Processor ` + + - Command Processor - Fetch (CPF) + + - Command Processor - Controller (CPC) + +- :ref:`Workgroup Manager or Shader Processor Input (SPI) ` + + - SPI Stats + + - SPI Resource Allocations + +- :ref:`Wavefront Launch ` + + - Wavefront Launch Stats + + - Wavefront runtime stats + + - per-SE Wavefront Scheduling performance + +- :ref:`Wavefront Lifetime ` + + - Wavefront lifetime breakdown + + - per-SE wavefront life (average) + + - per-SE wavefront life 
(histogram) + +- :ref:`Wavefront Occupancy ` + + - per-SE wavefront occupancy + + - per-CU wavefront occupancy + +- :ref:`Compute Unit - Instruction Mix ` + + - per-wave Instruction mix + + - per-wave VALU Arithmetic instruction mix + + - per-wave MFMA Arithmetic instruction mix + +- :ref:`Compute Unit - Compute Pipeline ` + + - Speed-of-Light: Compute Pipeline + + - Arithmetic OPs count + + - Compute pipeline stats + + - Memory latencies + +- :ref:`Local Data Share (LDS) ` + + - Speed-of-Light: LDS + + - LDS stats + +- :ref:`Instruction Cache ` + + - Speed-of-Light: Instruction Cache + + - Instruction Cache Accesses + +- Constant Cache + + - Speed-of-Light: Constant Cache + + - Constant Cache Accesses + + - Constant Cache - L2 Interface stats + +- :ref:`Texture Addresser and Texture Data ` + + - Texture Addresser (TA) + + - Texture Data (TD) + +- L1 Cache + + - Speed-of-Light: L1 Cache + + - L1 Cache Accesses + + - L1 Cache Stalls + + - L1 - L2 Transactions + + - L1 - UTCL1 Interface stats + +- :ref:`L2 Cache ` + + - Speed-of-Light: L2 Cache + + - L2 Cache Accesses + + - L2 - EA Transactions + + - L2 - EA Stalls + +- :ref:`L2 Cache Per Channel Performance ` + + - Per-channel L2 Hit rate + + - Per-channel L1-L2 Read requests + + - Per-channel L1-L2 Write Requests + + - Per-channel L1-L2 Atomic Requests + + - Per-channel L2-EA Read requests + + - Per-channel L2-EA Write requests + + - Per-channel L2-EA Atomic requests + + - Per-channel L2-EA Read latency + + - Per-channel L2-EA Write latency + + - Per-channel L2-EA Atomic latency + + - Per-channel L2-EA Read stall (I/O, GMI, HBM) + + - Per-channel L2-EA Write stall (I/O, GMI, HBM, Starve) + +Most panels are designed around a specific hardware component block to +thoroughly understand its behavior. Additional panels, including custom panels, +could also be added to aid the performance analysis. + +.. _grafana-panel-sys-info: + +System Info +^^^^^^^^^^^ + +.. 
figure:: ../../data/analyze/grafana/system-info_panel.png + :align: center + :alt: System details logged from the host machine + :width: 800 + + System details logged from the host machine. + +.. _grafana-panel-kernel-stats: + +Kernel Statistics +^^^^^^^^^^^^^^^^^ + +Kernel Time Histogram ++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/Kernel_time_histogram.png + :align: center + :alt: Kernel time histogram panel in Omniperf Grafana + :width: 800 + + Mapping application kernel launches to execution duration. + +Top Bottleneck Kernels +++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/top-stat_panel.png + :align: center + :alt: Top bottleneck kernels panel in Omniperf Grafana + :width: 800 + + Top N kernels and relevant statistics. Sorted by total duration. + +Top Bottleneck Dispatches ++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/Top_bottleneck_dispatches.png + :align: center + :alt: Top bottleneck dispatches panel in Omniperf Grafana + :width: 800 + + Top N kernel dispatches and relevant statistics. Sorted by total duration. + +Current and Baseline Dispatch IDs (Filtered) +++++++++++++++++++++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/Current_and_baseline_dispatch_ids.png + :align: center + :alt: Current and baseline dispatch IDs panel in Omniperf Grafana + :width: 800 + + List of all kernel dispatches. + +.. _grafana-panel-system-sol: + +System Speed-of-Light +^^^^^^^^^^^^^^^^^^^^^ + +.. figure:: ../../data/analyze/grafana/sol_panel.png + :align: center + :alt: System Speed-of-Light panel in Omniperf Grafana + :width: 800 + + Key metrics from various sections of Omniperf’s profiling report. + +.. tip:: + + See :doc:`/conceptual/system-speed-of-light` to learn about reported metrics. + +.. _grafana-panel-memory-chart-analysis: + +Memory Chart Analysis +^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + The Memory Chart Analysis support multiple normalizations. 
Due to limited + space, all transactions, when normalized to ``per_sec``, default to unit of + billion transactions per second. + +.. figure:: ../../data/analyze/grafana/memory-chart_panel.png + :align: center + :alt: Memory Chart Analysis panel in Omniperf Grafana + :width: 800 + + A graphical representation of performance data for memory blocks on the GPU. + + +.. _grafana-panel-roofline-analysis: + +Empirical Roofline Analysis +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. figure:: ../../data/analyze/grafana/roofline_panel.png + :align: center + :alt: Roofline Analysis panel in Omniperf Grafana + :width: 800 + + Visualize achieved performance relative to a benchmarked peak performance. + + +.. _grafana-panel-cp: + +Command Processor +^^^^^^^^^^^^^^^^^ + +.. tip:: + + See :doc:`/conceptual/command-processor` to learn about reported metrics. + +Command Processor Fetcher ++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cpc_panel.png + :align: center + :alt: Command Processor Fetcher panel in Omniperf Grafana + :width: 800 + + Fetches commands out of memory to hand them over to the Command Processor + Fetcher (CPC) for processing + +Command Processor Compute ++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cpf_panel.png + :align: center + :alt: Command Processor Compute panel in Omniperf Grafana + :width: 800 + + The micro-controller running the command processing firmware that decodes the + fetched commands, and (for kernels) passes them to the Workgroup Managers + (SPIs) for scheduling. + +.. _grafana-panel-spi: + +Shader Processor Input (SPI) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. tip:: + + See :ref:`desc-spi` to learn about reported metrics. + +SPI Stats ++++++++++ + +.. figure:: ../../data/analyze/grafana/spi-stats_panel.png + :align: center + :alt: SPI Stats panel in Omniperf Grafana + :width: 800 + +.. + TODO: Add caption after merge + +SPI Resource Allocation ++++++++++++++++++++++++ + +.. 
figure:: ../../data/analyze/grafana/spi-resource-allocation_panel.png + :align: center + :alt: SPI Resource Allocation panel in Omniperf Grafana + :width: 800 + +.. + TODO: Add caption after merge + +.. _grafana-panel-wavefront: + +Wavefront +^^^^^^^^^ + +Wavefront Launch Stats +++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/wavefront-launch-stats_panel.png + :align: center + :alt: Wavefront Launch Stats panel in Omniperf Grafana + :width: 800 + + General information about the kernel launch. + +.. tip:: + + See :ref:`wavefront-launch-stats` to learn about reported metrics. + +Wavefront Runtime Stats ++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/wavefront-runtime-stats_panel.png + :align: center + :alt: Wavefront Runtime Stats panel in Omniperf Grafana. + :width: 800 + + High-level overview of the execution of wavefronts in a kernel. + +.. tip:: + + See :ref:`wavefront-runtime-stats` to learn about reported metrics. + +.. _grafana-panel-cu-instruction-mix: + +Compute Unit - Instruction Mix +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Instruction Mix ++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cu-inst-mix_panel.png + :align: center + :alt: Instruction Mix panel in Omniperf Grafana + :width: 800 + + Breakdown of the various types of instructions executed by the user’s kernel, + and which pipelines on the Compute Unit (CU) they were executed on. + +.. tip:: + + See :ref:`instruction-mix` to learn about reported metrics. + +VALU Arithmetic Instruction Mix ++++++++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cu-value-arith-instr-mix_panel.png + :align: center + :alt: VALU Arithmetic Instruction Mix panel in Omniperf Grafana + :width: 800 + + The various types of vector instructions that were issued to the vector + arithmetic logic unit (VALU). + +.. tip:: + + See :ref:`valu-arith-instruction-mix` to learn about reported metrics. + +MFMA Arithmetic Instruction Mix ++++++++++++++++++++++++++++++++ + +.. 
figure:: ../../data/analyze/grafana/cu-mafma-arith-instr-mix_panel.png + :align: center + :alt: MFMA Arithmetic Instruction Mix panel in Omniperf Grafana + :width: 800 + + The types of Matrix Fused Multiply-Add (MFMA) instructions that were issued. + +.. tip:: + + See :ref:`mfma-instruction-mix` to learn about reported metrics. + +VMEM Arithmetic Instruction Mix ++++++++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cu-vmem-instr-mix_panel.png + :align: center + :alt: VMEM Arithmetic Instruction Mix panel in Omniperf Grafana + :width: 800 + + The types of vector memory (VMEM) instructions that were issued. + +.. tip:: + + See :ref:`vmem-instruction-mix` to learn about reported metrics. + +.. _grafana-panel-cu-compute-pipeline: + +Compute Unit - Compute Pipeline +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Speed-of-Light +++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cu-sol_panel.png + :align: center + :alt: Speed-of-Light (CU) panel in Omniperf Grafana + :width: 800 + + The number of floating-point and integer operations executed on the vector + arithmetic logic unit (VALU) and Matrix Fused Multiply-Add (MFMA) units in + various precisions. + +.. tip:: + + See :ref:`compute-speed-of-light` to learn about reported metrics. + +Pipeline Stats +++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cu-pipeline-stats_panel.png + :align: center + :alt: Pipeline Stats panel in Omniperf Grafana + :width: 800 + + More detailed metrics to analyze the several independent pipelines found in + the Compute Unit (CU). + +.. tip:: + + See :ref:`pipeline-stats` to learn about reported metrics. + +Arithmetic Operations ++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/cu-arith-ops_panel.png + :align: center + :alt: Arithmetic Operations panel in Omniperf Grafana + :width: 800 + + The total number of floating-point and integer operations executed in various + precisions. + +.. 
tip:: + + See :ref:`arithmetic-operations` to learn about reported metrics. + +.. _grafana-panel-lds: + +Local Data Share (LDS) +^^^^^^^^^^^^^^^^^^^^^^ + +Speed-of-Light +++++++++++++++ + +.. figure:: ../../data/analyze/grafana/lds-sol_panel.png + :align: center + :alt: Speed-of-Light (LDS) panel in Omniperf Grafana + :width: 800 + + Key metrics for the Local Data Share (LDS) as a comparison with the peak + achievable values of those metrics. + +.. tip:: + + See :ref:`lds-sol` to learn about reported metrics. + +LDS Stats ++++++++++ + +.. figure:: ../../data/analyze/grafana/lds-stats_panel.png + :align: center + :alt: LDS Stats panel in Omniperf Grafana + :width: 800 + + More detailed view of the Local Data Share (LDS) performance. + +.. tip:: + + See :ref:`lds-stats` to learn about reported metrics. + +.. _grafana-panel-instruction-cache: + +Instruction Cache +^^^^^^^^^^^^^^^^^ + +Speed-of-Light +++++++++++++++ + +.. figure:: ../../data/analyze/grafana/instr-cache-sol_panel.png + :align: center + :alt: Speed-of-Light (instruction cache) panel in Omniperf Grafana + :width: 800 + + Key metrics of the L1 Instruction (L1I) cache as a comparison with the peak + achievable values of those metrics. + +.. tip:: + + See :ref:`desc-l1i-sol` to learn about reported metrics. + +Instruction Cache Stats ++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/instr-cache-accesses_panel.png + :align: center + :alt: Instruction Cache Stats panel in Omniperf Grafana + :width: 800 + + More detail on the hit/miss statistics of the L1 Instruction (L1I) cache. + +.. tip:: + + See :ref:`desc-l1i-stats` to learn about reported metrics. + +.. _grafana-panel-sl1d-cache: + +Scalar L1D Cache +^^^^^^^^^^^^^^^^ + +.. tip:: + + See :ref:`desc-sl1d` to learn about reported metrics. + +Speed-of-Light +++++++++++++++ + +.. 
figure:: ../../data/analyze/grafana/sl1d-sol_panel.png + :align: center + :alt: Speed-of-Light (SL1D) panel in Omniperf Grafana + :width: 800 + + Key metrics of the Scalar L1 Data (sL1D) cache as a comparison with the peak + achievable values of those metrics. + +.. tip:: + + See :ref:`desc-sl1d-sol` to learn about reported metrics. + +Scalar L1D Cache Accesses ++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/sl1d-cache-accesses_panel.png + :align: center + :alt: Scalar L1D Cache Accesses panel in Omniperf Grafana + :width: 800 + + More detail on the types of accesses made to the Scalar L1 Data (sL1D) cache, + and the hit/miss statistics. + +.. tip:: + + See :ref:`desc-sl1d-stats` to learn about reported metrics. + +Scalar L1D Cache - L2 Interface ++++++++++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/sl1d-l12-interface_panel.png + :align: center + :alt: Scalar L1D Cache - L2 Interface panel in Omniperf Grafana + :width: 800 + + More detail on the data requested across the Scalar L1 Data (sL1D) cache <-> + L2 interface. + +.. tip:: + + See :ref:`desc-sl1d-l2-interface` to learn about reported metrics. + +.. _grafana-panel-ta: + +Texture Address and Texture Data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Texture Addresser ++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/ta_panel.png + :align: center + :alt: Texture Addresser panel in Omniperf Grafana + :width: 800 + + Metrics specific to the texture addresser (TA), which receives commands (e.g., + instructions) and write/atomic data from the Compute Unit (CU), and coalesces + them into fewer requests for the cache to process. + +.. tip:: + + See :ref:`desc-ta` to learn about reported metrics. + +.. _grafana-panel-td: + +Texture Data +++++++++++++ + +.. figure:: ../../data/analyze/grafana/td_panel.png + :align: center + :alt: Texture Data panel in Omniperf Grafana + :width: 800 + + Metrics specific to texture data (TD) which routes data back to the + requesting Compute Unit (CU). 
+ +.. tip:: + + See :ref:`desc-td` to learn about reported metrics. + +.. _grafana-panel-vl1d: + +Vector L1 Data Cache +^^^^^^^^^^^^^^^^^^^^ + +Speed-of-Light +++++++++++++++ + +.. figure:: ../../data/analyze/grafana/vl1d-sol_panel.png + :align: center + :alt: Speed-of-Light (VL1D) panel in Omniperf Grafana + :width: 800 + + Key metrics of the vector L1 data (vL1D) cache as a comparison with the peak + achievable values of those metrics. + +.. tip:: + + See :ref:`vl1d-sol` to learn about reported metrics. + +L1D Cache Stalls +++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/vl1d-cache-stalls_panel.png + :align: center + :alt: L1D Cache Stalls panel in Omniperf Grafana + :width: 800 + + More detail on where vector L1 data (vL1D) cache is stalled in the pipeline, + which may indicate performance limiters of the cache. + +.. tip:: + + See :ref:`vl1d-cache-stall-metrics` to learn about reported metrics. + +L1D Cache Accesses +++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/vl1d-cache-accesses_panel.png + :align: center + :alt: L1D Cache Accesses + :width: 800 + + The type of requests incoming from the cache front-end, the number of requests + that were serviced by the vector L1 data (vL1D) cache, and the number & type + of outgoing requests to the L2 cache. + +.. tip:: + + See :ref:`vl1d-cache-access-metrics` to learn about reported metrics. + +L1D - L2 Transactions ++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/vl1d-l2-transactions_panel.png + :align: center + :alt: L1D - L2 Transactions in Omniperf Grafana + :width: 800 + + A more granular look at the types of requests made to the L2 cache. + +.. tip:: + + See :ref:`vl1d-l2-transaction-detail` to learn more. + +L1D Addr Translation +++++++++++++++++++++ + +.. 
figure:: ../../data/analyze/grafana/vl1d-addr-translation_panel.png + :align: center + :alt: L1D Addr Translation panel in Omniperf Grafana + :width: 800 + + After a vector memory instruction has been processed/coalesced by the address + processing unit of the vector L1 data (vL1D) cache, it must be translated + from a virtual to physical address. These metrics provide more details on the + L1 Translation Lookaside Buffer (TLB) which handles this process. + +.. tip:: + + See :ref:`desc-utcl1` to learn about reported metrics. + +.. _grafana-panel-l2-cache: + +L2 Cache +^^^^^^^^ + +.. tip:: + + See :doc:`/conceptual/l2-cache` to learn about reported metrics. + +Speed-of-Light +++++++++++++++ + +.. figure:: ../../data/analyze/grafana/l2-sol_panel.png + :align: center + :alt: Speed-of-Light (L2 cache) panel in Omniperf Grafana + :width: 800 + + Key metrics about the performance of the L2 cache, aggregated over all the + L2 channels, as a comparison with the peak achievable values of those + metrics. + +.. tip:: + + See :ref:`l2-sol` to learn about reported metrics. + +L2 Cache Accesses ++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/l2-accesses_panel.png + :align: center + :alt: L2 Cache Accesses panel in Omniperf Grafana + :width: 800 + + Incoming requests to the L2 cache from the vector L1 data (vL1D) cache and + other clients (e.g., the sL1D and L1I caches). + +.. tip:: + + See :ref:`l2-cache-accesses` to learn about reported metrics. + +L2 - Fabric Transactions +++++++++++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/l2-fabric-transactions_panel.png + :align: center + :alt: L2 - Fabric Transactions panel in Omniperf Grafana + :width: 800 + + More detail on the flow of requests through Infinity Fabric™. + +.. tip:: + + See :ref:`l2-fabric` to learn about reported metrics. + +L2 - Fabric Interface Stalls +++++++++++++++++++++++++++++ + +.. 
figure:: ../../data/analyze/grafana/l2-fabric-interface-stalls_panel.png + :align: center + :alt: L2 - Fabric Interface Stalls panel in Omniperf Grafana + :width: 800 + + A breakdown of what types of requests in a kernel caused a stall + (e.g., read vs write), and to which locations (e.g., to the accelerator’s + local memory, or to remote accelerators/CPUs). + +.. tip:: + + See :ref:`l2-fabric-stalls` to learn about reported metrics. + +.. _grafana-panel-l2-cache-per-channel: + +L2 Cache Per Channel +^^^^^^^^^^^^^^^^^^^^ + +.. tip:: + + See :ref:`l2-sol` for more information. + +Aggregate Stats ++++++++++++++++ + +.. figure:: ../../data/analyze/grafana/l2-per-channel-agg-stats_panel.png + :align: center + :alt: Aggregate Stats (L2 cache per channel) panel in Omniperf Grafana + :width: 800 + + L2 Cache per channel performance at a glance. Metrics are aggregated over all available channels. diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/mode.rst b/projects/rocprofiler-compute/docs/how-to/analyze/mode.rst new file mode 100644 index 0000000000..b34e1214c4 --- /dev/null +++ b/projects/rocprofiler-compute/docs/how-to/analyze/mode.rst @@ -0,0 +1,36 @@ +.. meta:: + :description: How to use Omniperf's analyze mode + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, + Grafana, analysis, analyze mode + +************ +Analyze mode +************ + +Omniperf offers several ways to interact with the metrics it generates from +profiling. Your level of familiarity with the profiled application, computing +environment, and experience with Omniperf should inform the analysis method you +choose. + +While analyzing with the CLI offers quick and straightforward access to Omniperf +metrics from the terminal, Grafana's dashboard GUI adds an extra layer of +readability and interactivity you might prefer. + +See the following sections to explore Omniperf's analysis and visualization +options. 
+ +* :doc:`cli` +* :doc:`grafana-gui` +* :doc:`standalone-gui` + +.. note:: + + Analysis examples in this chapter borrow profiling results from the + ``vcopy.cpp`` workload introduced in :ref:`profile-example` in the + previous chapter. + + Unless otherwise noted, the performance analysis is done on the + :ref:`MI200 platform `. + +Learn about profiling with Omniperf in :doc:`../profile/mode`. For an overview of +Omniperf's other modes, see :ref:`modes`. diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/standalone-gui.rst b/projects/rocprofiler-compute/docs/how-to/analyze/standalone-gui.rst new file mode 100644 index 0000000000..a6a3e26f36 --- /dev/null +++ b/projects/rocprofiler-compute/docs/how-to/analyze/standalone-gui.rst @@ -0,0 +1,89 @@ +.. meta:: + :description: Omniperf analysis: Standalone GUI + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, GUI, standalone, filter + +*********************** +Standalone GUI analysis +*********************** + +Omniperf's standalone analysis GUI is a lightweight web page that you can +generate straight from the command line. The standalone analysis GUI is an +alternative to the CLI if you want to explore profiling results visually, but +without the additional setup requirements or server-side overhead of Omniperf's +detailed :doc:`Grafana interface ` option. This analysis +option is implemented as a simple `Flask `_ +application that lets you view results from your preferred web browser. + +.. note:: + + A point on *port forwarding*: the standalone GUI analyzer publishes its + web-based interface on port ``8050`` by default. On production HPC systems + where profiling jobs run under the control of a resource manager, additional + SSH tunneling between the desired web browser host (such as a login node or + remote workstation) and compute host may be required. Alternatively, you + might find it more convenient to download profiled workloads to perform + analysis on a local system. 
+ + See the :doc:`/reference/faq` for more details on SSH tunneling. + +Launch the standalone GUI analyzer +---------------------------------- + +To launch the Omniperf GUI analyzer, include the ``--gui`` flag with your +desired analysis command. For example: + +.. code-block:: shell + + $ omniperf analyze -p workloads/vcopy/MI200/ --gui + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + Analysis mode = web_ui + [analysis] deriving Omniperf metrics... + Dash is running on http://0.0.0.0:8050/ + + * Serving Flask app 'omniperf_analyze.analysis_webui' (lazy loading) + * Environment: production + WARNING: This is a development server. Do not use it in a production deployment. + Use a production WSGI server instead. + * Debug mode: off + * Running on all addresses (0.0.0.0) + WARNING: This is a development server. Do not use it in a production deployment. + * Running on http://127.0.0.1:8050 + * Running on http://10.228.33.172:8050 (Press CTRL+C to quit) + +At this point, you can launch your web browser of choice and navigate to +``http://localhost:8050/`` to view the analysis interface. + +.. image:: ../../data/analyze/standalone_gui.png + :align: center + :alt: Omniperf standalone GUI home screen + :width: 800 + +.. tip:: + + To launch the standalone GUI analyzer web app on a port other than ``8050``, + include the optional argument ``--gui ``. + +When no filters are applied, you'll see five basic sections derived from your +application's profiling data: + +#. Memory Chart Analysis +#. Empirical Roofline Analysis +#. Top Stats (Top Kernel Statistics) +#. System Info +#. System Speed-of-Light + +To dive deeper, use the dropdown menus at the top of the screen to isolate +particular kernels or dispatches. You should see the web page update with +metrics specific to your selected filters. 
+ +Once a filter is applied, you'll see several additional sections become +available with detailed metrics specific to that area of AMD hardware. These +detailed sections mirror the data displayed in Omniperf's +:doc:`Grafana interface `. diff --git a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst new file mode 100644 index 0000000000..de23a801ba --- /dev/null +++ b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst @@ -0,0 +1,455 @@ +.. meta:: + :description: How to use Omniperf's profile mode + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, + profiling, profile mode + +************ +Profile mode +************ + +The following chapter walks you through Omniperf's core profiling features by +example. + +Learn about analysis with Omniperf in :doc:`../analyze/mode`. For an overview of +Omniperf's other modes, see :ref:`modes`. + +Profiling +========= + +Use the ``omniperf`` executable to acquire all necessary performance monitoring +data through analysis of compute workloads. + +Profiling with Omniperf yields the following benefits. + +* :ref:`Automate counter collection `: Omniperf handles all + of your profiling via pre-configured input files. + +* :ref:`Filtering `: Apply runtime filters to speed up the profiling + process. + +* :ref:`Standalone roofline `: Isolate a subset of built-in + metrics or build your own profiling configuration. + +Run ``omniperf profile -h`` for more details. See +:ref:`Basic usage `. + +.. _profile-example: + +Profiling example +----------------- + +The ``__ repository +includes source code for a sample GPU compute workload, ``vcopy.cpp``. A copy of +this file is available in the ``share/sample`` subdirectory after a normal +Omniperf installation, or via the ``$OMNIPERF_SHARE/sample`` directory when +using the supplied modulefile. 
+ +The examples in this section use a compiled version of the ``vcopy`` workload to +demonstrate the use of Omniperf in MI accelerator performance analysis. Unless +otherwise noted, the performance analysis is done on the +:ref:`MI200 platform `. + +Workload compilation +^^^^^^^^^^^^^^^^^^^^ + +The following example demonstrates compilation of ``vcopy``. + +.. code-block:: shell + + $ hipcc vcopy.cpp -o vcopy + $ ls + vcopy vcopy.cpp + $ ./vcopy -n 1048576 -b 256 + vcopy testing on GCD 0 + Finished allocating vectors on the CPU + Finished allocating vectors on the GPU + Finished copying vectors to the GPU + sw thinks it moved 1.000000 KB per wave + Total threads: 1048576, Grid Size: 4096 block Size:256, Wavefronts:16384: + Launching the kernel on the GPU + Finished executing kernel + Finished copying the output vector from the GPU to the CPU + Releasing GPU memory + Releasing CPU memory + +The following sample command profiles the ``vcopy`` workload. + +.. code-block:: shell + + $ omniperf profile --name vcopy -- ./vcopy -n 1048576 -b 256 + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + Omniperf version: 2.0.0 + Profiler choice: rocprofv1 + Path: /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + Target: MI200 + Command: ./vcopy -n 1048576 -b 256 + Kernel Selection: None + Dispatch Selection: None + Hardware Blocks: All + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Collecting Performance Counters + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + [profiling] Current input file: /home/auser/repos/omniperf/sample/workloads/vcopy/MI200/perfmon/SQ_IFETCH_LEVEL.txt + |-> [rocprof] RPL: on '240312_174329' from '/opt/rocm-5.2.1' in '/home/auser/repos/omniperf/src/omniperf' + |-> [rocprof] RPL: profiling '""./vcopy -n 1048576 -b 256""' + |-> [rocprof] RPL: input file 
'/home/auser/repos/omniperf/sample/workloads/vcopy/MI200/perfmon/SQ_IFETCH_LEVEL.txt' + |-> [rocprof] RPL: output dir '/tmp/rpl_data_240312_174329_692890' + |-> [rocprof] RPL: result dir '/tmp/rpl_data_240312_174329_692890/input0_results_240312_174329' + |-> [rocprof] ROCProfiler: input from "/tmp/rpl_data_240312_174329_692890/input0.xml" + |-> [rocprof] gpu_index = + |-> [rocprof] kernel = + |-> [rocprof] range = + |-> [rocprof] 6 metrics + |-> [rocprof] GRBM_COUNT, GRBM_GUI_ACTIVE, SQ_WAVES, SQ_IFETCH, SQ_IFETCH_LEVEL, SQ_ACCUM_PREV_HIRES + |-> [rocprof] vcopy testing on GCD 0 + |-> [rocprof] Finished allocating vectors on the CPU + |-> [rocprof] Finished allocating vectors on the GPU + |-> [rocprof] Finished copying vectors to the GPU + |-> [rocprof] sw thinks it moved 1.000000 KB per wave + |-> [rocprof] Total threads: 1048576, Grid Size: 4096 block Size:256, Wavefronts:16384: + |-> [rocprof] Launching the kernel on the GPU + |-> [rocprof] Finished executing kernel + |-> [rocprof] Finished copying the output vector from the GPU to the CPU + |-> [rocprof] Releasing GPU memory + |-> [rocprof] Releasing CPU memory + |-> [rocprof] + |-> [rocprof] ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_240312_174329_692890/input0_results_240312_174329 + |-> [rocprof] File '/home/auser/repos/omniperf/sample/workloads/vcopy/MI200/SQ_IFETCH_LEVEL.csv' is generating + |-> [rocprof] + [profiling] Current input file: /home/auser/repos/omniperf/sample/workloads/vcopy/MI200/perfmon/SQ_INST_LEVEL_LDS.txt + + ... + + [roofline] Checking for roofline.csv in /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + [roofline] No roofline data found. Generating... + Empirical Roofline Calculation + Copyright © 2022 Advanced Micro Devices, Inc. All rights reserved. + Total detected GPU devices: 4 + GPU Device 0: Profiling... 
+ 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + HBM BW, GPU ID: 0, workgroupSize:256, workgroups:2097152, experiments:100, traffic:8589934592 bytes, duration:6.2 ms, mean:1388.0 GB/sec, stdev=3.1 GB/sec + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + L2 BW, GPU ID: 0, workgroupSize:256, workgroups:8192, experiments:100, traffic:687194767360 bytes, duration:136.5 ms, mean:5020.8 GB/sec, stdev=16.5 GB/sec + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + L1 BW, GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, traffic:26843545600 bytes, duration:2.9 ms, mean:9229.5 GB/sec, stdev=2.9 GB/sec + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + LDS BW, GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, traffic:33554432000 bytes, duration:1.9 ms, mean:17645.6 GB/sec, stdev=20.1 GB/sec + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + Peak FLOPs (FP32), GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, FLOP:274877906944, duration:13.078 ms, mean:20986.9 GFLOPS, stdev=310.8 GFLOPS + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + Peak FLOPs (FP64), GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, FLOP:137438953472, duration:6.7 ms, mean:20408.029297.1 GFLOPS, stdev=2.7 GFLOPS + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + Peak MFMA FLOPs (BF16), GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, FLOP:2147483648000, duration:12.6 ms, mean:170280.0 GFLOPS, stdev=22.3 GFLOPS + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + Peak MFMA FLOPs (F16), GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, FLOP:2147483648000, duration:13.0 ms, mean:164733.6 GFLOPS, stdev=24.3 GFLOPS + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + Peak MFMA FLOPs (F32), GPU ID: 0, workgroupSize:256, workgroups:16384, 
experiments:100, FLOP:536870912000, duration:13.0 ms, mean:41399.6 GFLOPS, stdev=4.1 GFLOPS + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + Peak MFMA FLOPs (F64), GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, FLOP:268435456000, duration:6.5 ms, mean:41379.2 GFLOPS, stdev=4.4 GFLOPS + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + Peak MFMA IOPs (I8), GPU ID: 0, workgroupSize:256, workgroups:16384, experiments:100, IOP:2147483648000, duration:12.9 ms, mean:166281.9 GOPS, stdev=2495.9 GOPS + GPU Device 1: Profiling... + ... + GPU Device 2: Profiling... + ... + GPU Device 3: Profiling... + ... + +.. tip:: + + To reduce verbosity of profiling output try the ``--quiet`` flag. This hides + ``rocprof`` output and activates a progress bar. + +.. _profiling-routine: + +Notice the two main stages in Omniperf's *default* profiling routine. + +1. The first stage collects all the counters needed for Omniperf analysis + (omitting any filters you have provided). + +2. The second stage collects data for the roofline analysis (this stage can be + disabled using ``--no-roof``). + +At the end of profiling, you can find all resulting ``csv`` files in a +:ref:`SoC `-specific target directory; for +example: + +* "MI300A" or "MI300X" for the AMD Instinct™ MI300 family of accelerators +* "MI200" for the AMD Instinct MI200 family of accelerators +* "MI100" for the AMD Instinct MI100 family of accelerators + +The SoC names are generated as a part of Omniperf, and do not *always* +distinguish between different accelerators in the same family; for instance, +an Instinct MI210 vs an Instinct MI250. + +.. note:: + + Additionally, you will notice a few extra files. An SoC parameters file, + ``sysinfo.csv``, is created to reflect the target device settings. All + profiling output is stored in ``log.txt``. Roofline-specific benchmark + results are stored in ``roofline.csv``. + +.. 
code-block:: shell + + $ ls workloads/vcopy/MI200/ + total 112 + total 60 + -rw-r--r-- 1 auser agroup 27937 Mar 1 15:15 log.txt + drwxr-xr-x 1 auser agroup 0 Mar 1 15:15 perfmon + -rw-r--r-- 1 auser agroup 26175 Mar 1 15:15 pmc_perf.csv + -rw-r--r-- 1 auser agroup 1708 Mar 1 15:17 roofline.csv + -rw-r--r-- 1 auser agroup 519 Mar 1 15:15 SQ_IFETCH_LEVEL.csv + -rw-r--r-- 1 auser agroup 456 Mar 1 15:15 SQ_INST_LEVEL_LDS.csv + -rw-r--r-- 1 auser agroup 474 Mar 1 15:15 SQ_INST_LEVEL_SMEM.csv + -rw-r--r-- 1 auser agroup 474 Mar 1 15:15 SQ_INST_LEVEL_VMEM.csv + -rw-r--r-- 1 auser agroup 599 Mar 1 15:15 SQ_LEVEL_WAVES.csv + -rw-r--r-- 1 auser agroup 650 Mar 1 15:15 sysinfo.csv + -rw-r--r-- 1 auser agroup 399 Mar 1 15:15 timestamps.csv + +.. _filtering: + +Filtering +========= + +To reduce profiling time and the counters collected, you should use profiling +filters. Profiling filters and their functionality depend on the underlying +profiler being used. While Omniperf is profiler-agnostic, the following is a +detailed description of profiling filters available when using Omniperf with +:doc:`ROCProfiler `. + +Filtering options +----------------- + +``-b``, ``--block `` + Allows system profiling on one or more selected hardware components to speed + up the profiling process. See :ref:`profiling-hw-component-filtering`. + +``-k``, ``--kernel `` + Allows for kernel filtering. Usage is equivalent with the current ``rocprof`` + utility. See :ref:`profiling-kernel-filtering`. + +``-d``, ``--dispatch `` + Allows for dispatch ID filtering. Usage is equivalent with the current + ``rocprof`` utility. See :ref:`profiling-dispatch-filtering`. + +.. tip:: + + Be cautious when combining different profiling filters in the same call. + Conflicting filters may result in an error. + + For example, filtering on a dispatch that doesn't match your + kernel name filter results in a conflict. + +.. 
_profiling-hw-component-filtering: + +Hardware component filtering +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can profile specific hardware components to speed up the profiling process. +In Omniperf, the term hardware block refers to a hardware component or a +group of hardware components. All profiling results are accumulated in the same +target directory without overwriting those for other hardware components. This +enables incremental profiling and analysis. + +The following example only gathers hardware counters for the shader sequencer +(SQ) and L2 cache (TCC) components, skipping all other hardware components. + +.. code-block:: shell + + $ omniperf profile --name vcopy -b SQ TCC -- ./vcopy -n 1048576 -b 256 + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + fname: pmc_cpc_perf: Skipped + fname: pmc_spi_perf: Skipped + fname: pmc_cpf_perf: Skipped + fname: pmc_tcp_perf: Skipped + fname: pmc_sq_perf4: Added + fname: pmc_tcc_perf: Added + fname: pmc_sq_perf8: Added + fname: pmc_ta_perf: Skipped + fname: pmc_sq_perf1: Added + fname: pmc_sq_perf3: Added + fname: pmc_td_perf: Skipped + fname: pmc_tcc2_perf: Skipped + fname: pmc_sqc_perf1: Skipped + fname: pmc_sq_perf6: Added + fname: pmc_sq_perf2: Added + Omniperf version: 2.0.0 + Profiler choice: rocprofv1 + Path: /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + Target: MI200 + Command: ./vcopy -n 1048576 -b 256 + Kernel Selection: None + Dispatch Selection: None + Hardware Blocks: ['sq', 'tcc'] + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Collecting Performance Counters + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ... + +.. _profiling-kernel-filtering: + +Kernel filtering +^^^^^^^^^^^^^^^^ + +Kernel filtering is based on the name of the kernels you want to isolate. Use a +kernel name substring list to isolate desired kernels. 
+ +The following example demonstrates profiling isolating the kernel matching +substring ``vecCopy``. + +.. code-block:: shell + + $ omniperf profile --name vcopy -k vecCopy -- ./vcopy -n 1048576 -b 256 + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + Omniperf version: 2.0.0 + Profiler choice: rocprofv1 + Path: /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + Target: MI200 + Command: ./vcopy -n 1048576 -b 256 + Kernel Selection: ['vecCopy'] + Dispatch Selection: None + Hardware Blocks: All + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Collecting Performance Counters + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ... + +.. _profiling-dispatch-filtering: + +Dispatch filtering +^^^^^^^^^^^^^^^^^^ + +Dispatch filtering is based on the *global* dispatch index of kernels in a run. + +The following example profiles only the first kernel dispatch in the execution +of the application (note zero-based indexing). + +.. code-block:: shell + + $ omniperf profile --name vcopy -d 0 -- ./vcopy -n 1048576 -b 256 + + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| + | | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ + | |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| + + Omniperf version: 2.0.0 + Profiler choice: rocprofv1 + Path: /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + Target: MI200 + Command: ./vcopy -n 1048576 -b 256 + Kernel Selection: None + Dispatch Selection: ['0'] + Hardware Blocks: All + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Collecting Performance Counters + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ... + +.. _standalone-roofline: + +Standalone roofline +=================== + +If you are only interested in generating roofline analysis data try using +``--roof-only``. 
This will only collect counters relevant to roofline, as well +as generate a standalone ``.pdf`` output of your roofline plot. + +Roofline options +---------------- + +``--sort `` + Allows you to specify whether you would like to overlay top kernel or top + dispatch data in your roofline plot. + +``-m``, ``--mem-level `` + Allows you to specify specific levels of cache to include in your roofline + plot. + +``--device `` + Allows you to specify a device ID to collect performance data from when + running a roofline benchmark on your system. + +To distinguish different kernels in your ``.pdf`` roofline plot use +``--kernel-names``. This will give each kernel a unique marker identifiable from +the plot's key. + + +Roofline only +------------- + +The following example demonstrates profiling roofline data only: + +.. code-block:: shell + + $ omniperf profile --name vcopy --roof-only -- ./vcopy -n 1048576 -b 256 + + ... + [roofline] Checking for roofline.csv in /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + [roofline] No roofline data found. Generating... + Checking for roofline.csv in /home/auser/repos/omniperf/sample/workloads/vcopy/MI200 + Empirical Roofline Calculation + Copyright © 2022 Advanced Micro Devices, Inc. All rights reserved. + Total detected GPU devices: 4 + GPU Device 0: Profiling... + 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] + ... + Empirical Roofline PDFs saved! + +An inspection of our workload output folder shows ``.pdf`` plots were generated +successfully. + +.. 
code-block:: shell + + $ ls workloads/vcopy/MI200/ + total 48 + -rw-r--r-- 1 auser agroup 13331 Mar 1 16:05 empirRoof_gpu-0_fp32_fp64.pdf + -rw-r--r-- 1 auser agroup 13136 Mar 1 16:05 empirRoof_gpu-0_int8_fp16.pdf + drwxr-xr-x 1 auser agroup 0 Mar 1 16:03 perfmon + -rw-r--r-- 1 auser agroup 1101 Mar 1 16:03 pmc_perf.csv + -rw-r--r-- 1 auser agroup 1715 Mar 1 16:05 roofline.csv + -rw-r--r-- 1 auser agroup 650 Mar 1 16:03 sysinfo.csv + -rw-r--r-- 1 auser agroup 399 Mar 1 16:03 timestamps.csv + +.. note:: + + Omniperf generates two roofline outputs to organize results and reduce + clutter. One chart plots FP32/FP64 performance while the other plots I8/FP16 + performance. + +The following image is a sample ``empirRoof_gpu-ALL_fp32_fp64.pdf`` roofline +plot. + +.. image:: ../../data/profile/sample-roof-plot.png + :align: center + :alt: Sample Omniperf roofline output + :width: 800 + diff --git a/projects/rocprofiler-compute/docs/how-to/use.rst b/projects/rocprofiler-compute/docs/how-to/use.rst new file mode 100644 index 0000000000..7377dd9f95 --- /dev/null +++ b/projects/rocprofiler-compute/docs/how-to/use.rst @@ -0,0 +1,251 @@ +.. meta:: + :description: Omniperf basic usage + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, + basics, usage, operations + +*********** +Basic usage +*********** + +The following section outlines basic Omniperf workflows, modes, options, and +operations. + +Command line profiler +===================== + +Launch and profile the target application using the command line profiler. + +The command line profiler launches the target application, calls the +ROCProfiler API via the ``rocprof`` binary, and collects profile results for +the specified kernels, dispatches, and hardware components. If not +specified, Omniperf defaults to collecting all available counters for all +kernels and dispatches launched by your executable. 
+ +To collect the default set of data for all kernels in the target +application, launch, for example: + +.. code-block:: shell + + $ omniperf profile -n vcopy_data -- ./vcopy -n 1048576 -b 256 + +This runs the app, launches each kernel, and generates profiling results. By +default, results are written to a subdirectory with your accelerator's name; +for example, ``./workloads/vcopy_data/MI200/``, where the workload name +(``vcopy_data``) is configurable via the ``-n`` argument. + +.. note:: + + To collect all requested profile information, Omniperf might replay kernels + multiple times. + +.. _basic-filter-data-collection: + +Customize data collection +------------------------- + +Options are available to specify for which kernels and metrics data should be +collected. Note that you can apply filtering in either the profiling or +analysis stage. Filtering at profiling collection often speeds up your +aggregate profiling run time. + +Common filters to customize data collection include: + +``-k``, ``--kernel`` + Enables filtering kernels by name. + +``-d``, ``--dispatch`` + Enables filtering based on dispatch ID. + +``-b``, ``--block`` + Enables collecting metrics for only the specified (one or more) hardware + component blocks. + +See :ref:`Filtering <filtering>` for an in-depth walkthrough. + +To view available metrics by hardware block, use the ``--list-metrics`` +argument: + +.. code-block:: shell + + $ omniperf analyze --list-metrics + +.. _basic-analyze-cli: + +Analyze in the command line +--------------------------- + +After generating a local output folder (for example, +``./workloads/vcopy_data/MI200``), use the command line tool to quickly +interface with profiling results. View different metrics derived from your +profiled results and get immediate access to all metrics organized by hardware +blocks. + +If you don't apply kernel, dispatch, or hardware block filters at this stage, +analysis is reflective of the entirety of the profiling data. 
+ +To interact with profiling results from a different session, provide the +workload path. + +``-p``, ``--path`` + Enables you to analyze existing profiling data in the Omniperf CLI. + +See :doc:`analyze/cli` for more detailed information. + +.. _basic-analyze-grafana: + +Analyze in the Grafana GUI +-------------------------- + +To conduct a more in-depth analysis of profiling results, it's suggested to use +a Grafana GUI with Omniperf. To interact with profiling results, import your +data to the MongoDB instance included in the Omniperf Dockerfile. See +:doc:`/install/grafana-setup`. + +To interact with Grafana data, stored in the Omniperf database, enter +``database`` :ref:`mode `; for example: + +.. code-block:: shell + + $ omniperf database --import [CONNECTION OPTIONS] + +See :doc:`/how-to/analyze/grafana-gui` for more detailed information. + +.. _modes: + +Modes +===== + +Modes change the fundamental behavior of the Omniperf command line tool. +Depending on which mode you choose, different command line options become +available. + +.. _modes-profile: + +Profile mode +------------ + +``profile`` + Launches the target application on the local system using + :doc:`ROCProfiler `. Depending on the profiling options + chosen, selected kernels, dispatches, and/or hardware components used by the + application are profiled. It stores results locally in an output folder: + ``./workloads/\``. + + .. code-block:: shell + + $ omniperf profile --help + +See :doc:`profile/mode` to learn about this mode in depth and to get started +profiling with Omniperf. + +.. _modes-analyze: + +Analyze mode +------------ + +``analyze`` + Loads profiling data from the ``--path`` (``-p``) directory into the Omniperf + CLI analyzer where you have immediate access to profiling results and + generated metrics. It generates metrics from the entirety of your profiled + application or a subset identified through the Omniperf CLI analysis filters. 
+ + To generate a lightweight GUI interface, you can add the ``--gui`` flag to your + analysis command. + + This mode is a middle ground to the highly detailed Omniperf Grafana GUI and + is great if you want immediate access to a hardware component you’re already + familiar with. + + .. code-block:: shell + + $ omniperf analyze --help + +See :doc:`analyze/mode` to learn about this mode in depth and to get started +with analysis using Omniperf. + +.. _modes-database: + +Database mode +------------- + +``database`` + The Grafana analyzer GUI is built on a MongoDB database. ``--import`` + profiling results to the DB to interact with the workload in Grafana or + ``--remove`` the workload from the DB. + + Connection options need to be specified. See :doc:`/how-to/analyze/grafana-gui` for + more details. + + .. code-block:: shell + + $ omniperf database --help + +See :doc:`/install/grafana-setup` to learn about setting up a Grafana server and +database instance to make your profiling data more digestible and shareable. + +.. _global-options: + +Global options +============== + +The Omniperf command line tool has a set of *global* utility options that are +available across all modes. + +``-v``, ``--version`` + Prints the Omniperf version and exits. + +``-V``, ``--verbose`` + Increases output verbosity. Use multiple times for higher levels of + verbosity. + +``-q``, ``--quiet`` + Reduces output verbosity and runs quietly. + +``-s``, ``--specs`` + Prints system specs and exits. + +.. note:: + + Omniperf also recognizes the project variable, ``OMNIPERF_COLOR`` should you + choose to disable colorful output. To disable default colorful behavior, set + this variable to ``0``. + +.. _basic-operations: + +Basic operations +================ + +The following table lists Omniperf's basic operations, their +:ref:`modes `, and required arguments. + +.. 
list-table:: + :header-rows: 1 + + * - Operation description + - Mode + - Required arguments + + * - :doc:`Profile a workload ` + - ``profile`` + - ``--name``, ``-- `` + + * - :ref:`Standalone roofline analysis ` + - ``profile`` + - ``--name``, ``--roof-only``, ``-- `` + + * - :ref:`Import a workload to database ` + - ``database`` + - ``--import``, ``--host``, ``--username``, ``--workload``, ``--team`` + + * - :ref:`Remove a workload from database ` + - ``database`` + - ``--remove``, ``--host``, ``--username``, ``--workload``, ``--team`` + + * - :doc:`Launch standalone GUI from CLI ` + - ``analyze`` + - ``--path``, ``--gui`` + + * - :doc:`Interact with profiling results from CLI ` + - ``analyze`` + - ``--path`` + diff --git a/projects/rocprofiler-compute/docs/index.rst b/projects/rocprofiler-compute/docs/index.rst new file mode 100644 index 0000000000..1df329e7de --- /dev/null +++ b/projects/rocprofiler-compute/docs/index.rst @@ -0,0 +1,87 @@ +.. meta:: + :description: Omniperf documentation and reference + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD + +********************** +Omniperf documentation +********************** + +Omniperf documentation provides a comprehensive overview of Omniperf. +In addition to a full deployment guide with installation instructions, this +documentation also explains the ideas motivating the design behind the tool and +its components. + +If you're new to Omniperf, familiarize yourself with the tool by reviewing the +chapters that follow and gradually learn its more advanced features. To get +started, see :doc:`What is Omniperf? `. + +Omniperf is open source and hosted at ``__. + +.. grid:: 2 + :gutter: 3 + + .. grid-item-card:: Install + + * :doc:`install/core-install` + * :doc:`Grafana server for Omniperf ` + + .. grid-item:: + +Use the following topics to learn more about the advantages of Omniperf in your +development toolkit, how it aims to model performance, and how to use Omniperf +in practice. + +.. 
grid:: 2 + :gutter: 3 + + .. grid-item-card:: How to + + * :doc:`how-to/use` + + * :doc:`how-to/profile/mode` + + * :doc:`how-to/analyze/mode` + + * :doc:`how-to/analyze/cli` + + * :doc:`how-to/analyze/grafana-gui` + + * :doc:`how-to/analyze/standalone-gui` + + .. grid-item-card:: Conceptual + + * :doc:`conceptual/performance-model` + + * :doc:`conceptual/compute-unit` + + * :doc:`conceptual/l2-cache` + + * :doc:`conceptual/shader-engine` + + * :doc:`conceptual/command-processor` + + * :doc:`conceptual/system-speed-of-light` + + * :doc:`conceptual/definitions` + + * :ref:`normalization-units` + + .. grid-item-card:: Tutorials + + * :doc:`tutorial/profiling-by-example` + + * :doc:`Learning resources ` + + .. grid-item-card:: Reference + + * :doc:`reference/compatible-accelerators` + + * :doc:`reference/faq` + +This project is proudly open source. For more details on how to contribute, +refer to +`Contributing to ROCm `_. + +Find ROCm licensing information on the +`Licensing `_ page. + diff --git a/projects/rocprofiler-compute/docs/install/core-install.rst b/projects/rocprofiler-compute/docs/install/core-install.rst new file mode 100644 index 0000000000..1d28b07b5d --- /dev/null +++ b/projects/rocprofiler-compute/docs/install/core-install.rst @@ -0,0 +1,236 @@ +.. meta:: + :description: Omniperf installation and deployment + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, + install, deploy, Grafana, client, configuration, modulefiles + +********************************* +Installing and deploying Omniperf +********************************* + +Omniperf consists of two installation components. + +* :ref:`Omniperf core installation ` (client-side) + + * Provides the core application profiling capability. + * Allows the collection of performance counters, filtering by hardware + block, dispatch, kernel, and more. + * Provides a CLI-based analysis mode. + * Provides a standalone web interface for importing analysis metrics. 
+ +* :doc:`Grafana server for Omniperf ` (server-side) (*optional*) + + * Hosts the MongoDB backend and Grafana instance. + * Is packaged in a Docker container for easy setup. + +Determine what you need to install based on how you would like to interact with +Omniperf. See the following decision tree to help determine what installation is +right for you. + +.. image:: ../data/install/install-decision-tree.png + :align: center + :alt: Decision tree for installing and deploying Omniperf + :width: 800 + +.. _core-install: + +Core installation +================= + +The core Omniperf application requires the following basic software +dependencies. As of ROCm 6.2, the core Omniperf is included with your ROCm +installation. + +* Python ``>= 3.8`` +* CMake ``>= 3.19`` +* ROCm ``>= 5.7.1`` + +Omniperf depends on a number of Python packages documented in the top-level +``requirements.txt`` file. Install these *before* configuring Omniperf. + +.. tip:: + + If looking to build Omniperf as a developer, consider these additional + requirements. + + .. list-table:: + + * - ``docs/sphinx/requirements.txt`` + - Python packages required to build this documentation from source. + + * - ``requirements-test.txt`` + - Python packages required to run Omniperf's CI suite using PyTest. + +The recommended procedure for Omniperf usage is to install into a shared file +system so that multiple users can access the final installation. The +following steps illustrate how to install the necessary Python dependencies +using `pip `_ and Omniperf into a +shared location controlled by the ``INSTALL_DIR`` environment variable. + +.. _core-install-cmake-vars: + +Configuration variables +----------------------- +The following installation example leverages several +`CMake `_ project variables defined as +follows. + +.. list-table:: + :header-rows: 1 + + * - CMake variable + - Description + + * - ``CMAKE_INSTALL_PREFIX`` + - Controls the install path for Omniperf files. 
+ + * - ``PYTHON_DEPS`` + - Specifies an optional path to resolve Python package dependencies. + + * - ``MOD_INSTALL_PATH`` + - Specifies an optional path for separate Omniperf modulefile installation. + +.. _core-install-steps: + +Install from source +------------------- + +#. A typical install begins by downloading the latest release tarball available + from ``__. From there, untar and + navigate into the top-level directory. + + .. + {{ config.version }} substitutes the Omniperf version in ../conf.py + + .. datatemplate:nodata:: + + .. code-block:: shell + + tar xfz omniperf-v{{ config.version }}.tar.gz + cd omniperf-v{{ config.version }} + +#. Next, install Python dependencies and complete the Omniperf configuration and + install process. + + .. datatemplate:nodata:: + + .. code-block:: shell + + # define top-level install path + export INSTALL_DIR= + + # install python deps + python3 -m pip install -t ${INSTALL_DIR}/python-libs -r requirements.txt + + # configure Omniperf for shared install + mkdir build + cd build + cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/{{ config.version }} \ + -DPYTHON_DEPS=${INSTALL_DIR}/python-libs \ + -DMOD_INSTALL_PATH=${INSTALL_DIR}/modulefiles .. + + # install + make install + + .. tip:: + + You might need to ``sudo`` the final installation step if you don't have + write access for the chosen installation path. + +#. Upon successful installation, your top-level installation directory should + look like this. + + .. datatemplate:nodata:: + + .. code-block:: shell + + $ ls $INSTALL_DIR + modulefiles {{ config.version }} python-libs + +.. _core-install-modulefiles: + +Execution using modulefiles +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The installation process includes the creation of an environment modulefile for +use with `Lmod `_. On systems that support Lmod, +you can register the Omniperf modulefile directory and setup your environment +for execution of Omniperf as follows. + +.. datatemplate:nodata:: + + .. 
code-block:: shell + + $ module use $INSTALL_DIR/modulefiles + $ module load omniperf + $ which omniperf + /opt/apps/omniperf/{{ config.version }}/bin/omniperf + + $ omniperf --version + ROC Profiler: /opt/rocm-5.1.0/bin/rocprof + + omniperf (v{{ config.version }}) + +.. tip:: + + If you're relying on an Lmod Python module locally, you may wish to customize + the resulting Omniperf modulefile post-installation to include extra + module dependencies. + +Execution without modulefiles +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To use Omniperf without the companion modulefile, update your ``PATH`` +settings to enable access to the command line binary. If you installed Python +dependencies in a shared location, also update your ``PYTHONPATH`` +configuration. + +.. datatemplate:nodata:: + + .. code-block:: shell + + export PATH=$INSTALL_DIR/{{ config.version }}/bin:$PATH + export PYTHONPATH=$INSTALL_DIR/python-libs + +.. _core-install-package: + +Install via package manager +--------------------------- + +Once ROCm (minimum version 6.2.0) is installed, you can install Omniperf using +your operating system's native package manager using the following commands. +See :doc:`rocm-install-on-linux:index` for guidance on installing the ROCm +software stack. + +.. tab-set:: + + .. tab-item:: Ubuntu + + .. code-block:: shell + + $ sudo apt install omniperf + $ pip install -r /opt/rocm/libexec/omniperf/requirements.txt + + .. tab-item:: Red Hat Enterprise Linux + + .. code-block:: shell + + $ sudo dnf install omniperf + $ pip install -r /opt/rocm/libexec/omniperf/requirements.txt + + .. tab-item:: SUSE Linux Enterprise Server + + .. code-block:: shell + + $ sudo zypper install omniperf + $ pip install -r /opt/rocm/libexec/omniperf/requirements.txt + +.. _core-install-rocprof-var: + +ROCProfiler +----------- + +Omniperf relies on :doc:`ROCProfiler `'s ``rocprof`` binary +during the profiling process. 
Normally, the path to this binary is detected +automatically, but you can override the path by setting the optional +``ROCPROF`` environment variable. + diff --git a/projects/rocprofiler-compute/docs/install/grafana-setup.rst b/projects/rocprofiler-compute/docs/install/grafana-setup.rst new file mode 100644 index 0000000000..a7486d286d --- /dev/null +++ b/projects/rocprofiler-compute/docs/install/grafana-setup.rst @@ -0,0 +1,209 @@ +.. meta:: + :description: Omniperf Grafana server installation and deployment + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, + install, deploy, Grafana, server, configuration, GUI + +**************************************** +Setting up a Grafana server for Omniperf +**************************************** + +A Grafana server is *not required* to profile or analyze performance data +from the CLI. It's a supplementary mechanism to help you import performance +data and examine it in a detailed +`Grafana `_ dashboard GUI. + +Learn about installing and configuring the main Omniperf tool in +:ref:`core-install`. + +Setting up a Grafana instance for Omniperf requires the following basic software +dependencies. + +* `Docker Engine `_ + +The recommended process for enabling the server-side of Omniperf is to use the +provided ``Dockerfile`` to build the Grafana and MongoDB instance. + +.. _grafana-mongodb-setup: + +Set up Grafana and MongoDB +========================== + +Once you've decided where to host the Grafana and MongoDB instance, complete the +following setup instructions. + +Install MongoDB utilities +------------------------- + +Omniperf uses the +`mongoimport `_ +utility to upload data to your Grafana instance's backend database. + +Use the following commands to install MongoDB utilities for Ubuntu 20.04. + +.. 
code-block:: bash + + $ wget https://fastdl.mongodb.org/tools/db/mongodb-database-tools-ubuntu2004-x86_64-100.6.1.deb + $ sudo apt install ./mongodb-database-tools-ubuntu2004-x86_64-100.6.1.deb + +.. note:: + + Find installation instructions for other distributions in + `MongoDB Database Tools Downloads `_. + +.. _grafana-persistent-storage-setup: + +Set up persistent storage +------------------------- + +Bind MongoDB to a directory on the host OS to create a local backup in case of a +crash or reset. This is called *creating a persistent volume*. + +.. code-block:: bash + + $ sudo mkdir -p /usr/local/persist && cd /usr/local/persist/ + $ sudo mkdir -p grafana-storage mongodb + $ sudo docker volume create --driver local --opt type=none --opt device=/usr/local/persist/grafana-storage --opt o=bind grafana-storage + $ sudo docker volume create --driver local --opt type=none --opt device=/usr/local/persist/mongodb --opt o=bind grafana-mongo-db + +.. _grafana-docker-container: + +Build and launch the Docker container +------------------------------------- + +You're now ready to build your ``Dockerfile``. Navigate to your Omniperf install +directory to begin. + +.. code-block:: bash + + $ cd grafana + $ sudo docker-compose build + $ sudo docker-compose up -d + +The TCP ports for Grafana (``4000``) and MongoDB (``27017``) in the Docker +container are mapped to ``14000`` and ``27018``, respectively, on the host side. + +.. tip:: + + In the event that either your Grafana or MongoDB instance crashes fatally, + just restart the server. Navigate to your install directory and run: + + .. code-block:: + + $ sudo docker-compose down + $ sudo docker-compose up -d + +.. _grafana-dashboard-setup: + +Set up the Grafana dashboard +---------------------------- + +Once you've launched your Docker container you should be able to reach Grafana +at ``http://:14000``. 
The default login credentials for your first-time +Grafana setup are: + +* **Username**: ``admin`` +* **Password**: ``admin`` + +.. figure:: ../data/install/grafana_welcome.png + :align: center + :alt: Grafana dashboard welcome screen + :width: 800 + + Grafana's welcome screen. + +.. _grafana-datasource-setup: + +Configure the MongoDB data source +--------------------------------- + +You must configure your MongoDB data source in Grafana before first-time use. +Navigate to Grafana's **Configuration** page to add the "Omniperf Data" +connection. + +.. figure:: ../data/install/datasource_config.jpg + :align: center + :alt: Grafana data source configuration + :width: 800 + + Grafana's Configuration page. + +Configure the following fields in the data source settings. + +.. list-table:: + :stub-columns: 1 + + * - HTTP URL + - ``http://localhost:3333`` + + * - MongoDB URL + - ``mongodb://temp:temp123@\:27018/admin?authSource=admin`` + + * - Database Name + - ``admin`` + +After configuring these fields, click **Save & test** to make sure your +connection is successful. + +.. figure:: ../data/install/datasource_settings.jpg + :align: center + :alt: Grafana data source settings + :width: 800 + + Grafana data source settings. + +.. note:: + + To avoid potential DNS issues, you might need to use the actual IP address + for the host node in the MongoDB URL. + +.. _grafana-import-dashboard-file: + +Import the Omniperf dashboard file +---------------------------------- + +From the **Create** → **Import** page, upload the dashboard file, +``/dashboards/Omniperf_v{__VERSION__}_pub.json`` from the +:doc:`Omniperf tarball `. + +Edit both the dashboard **Name** and the **Unique identifier (UID)** fields to +uniquely identify the dashboard. Click **Import** to complete the process. + +.. figure:: ../data/install/import_dashboard.png + :align: center + :alt: Grafana's import dashboard + :width: 800 + + Grafana's Import dashboard. + +.. 
_grafana-select-workload: + +Select and load the Omniperf workload +------------------------------------- + +Once you have imported a dashboard you're ready to begin. Start by browsing +available dashboards and selecting the dashboard you have just imported. + +.. figure:: ../data/install/opening_dashboard.png + :align: center + :alt: Opening your Omniperf dashboard in Grafana + :width: 800 + + Opening your Omniperf profiling dashboard in Grafana. + +Remember that you need to upload workload data to the MongoDB backend before +analyzing in your Grafana interface. See a detailed example of this in +:ref:`grafana-gui-import`. + +After a workload has been successfully uploaded, you should be able to select it +from the workload dropdown located at the top of your Grafana dashboard. + +.. figure:: ../data/install/grafana_workload_selection.png + :align: center + :alt: Omniperf workload selection in Grafana + :width: 800 + + Selecting your Omniperf workload in Grafana. + +For more information on how to use the Grafana interface for analysis see +:doc:`/how-to/analyze/grafana-gui`. + diff --git a/projects/rocprofiler-compute/docs/license.rst b/projects/rocprofiler-compute/docs/license.rst new file mode 100644 index 0000000000..c423ed34f1 --- /dev/null +++ b/projects/rocprofiler-compute/docs/license.rst @@ -0,0 +1,10 @@ +.. meta:: + :description: Omniperf license + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, + license + +******* +License +******* + +.. include:: ../LICENSE diff --git a/projects/rocprofiler-compute/docs/reference/compatible-accelerators.rst b/projects/rocprofiler-compute/docs/reference/compatible-accelerators.rst new file mode 100644 index 0000000000..b93c720324 --- /dev/null +++ b/projects/rocprofiler-compute/docs/reference/compatible-accelerators.rst @@ -0,0 +1,36 @@ +.. 
meta:: + :description: Omniperf support: compatible accelerators and GPUs + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, GPU + +*********************** +Compatible accelerators +*********************** + +The following table lists SoCs (System on Chip) tested for compatibility with +Omniperf. See :doc:`rocm:reference/gpu-arch-specs` for full AMD accelerator and +GPU specifications. + +.. _def-soc: + +.. note:: + + In Omniperf documentation, the term System on Chip (SoC) refers to a + particular family of AMD accelerators. + +.. list-table:: + :header-rows: 1 + + * - Platform + - Status + + * - AMD Instinct™ MI300 + - Supported ✅ + + * - AMD Instinct MI200 + - Supported ✅ + + * - AMD Instinct MI100 + - Supported ✅ + + * - AMD Instinct MI50, MI60 (Vega 20) + - No support ❌ diff --git a/projects/rocprofiler-compute/docs/reference/faq.rst b/projects/rocprofiler-compute/docs/reference/faq.rst new file mode 100644 index 0000000000..3cbbe778fc --- /dev/null +++ b/projects/rocprofiler-compute/docs/reference/faq.rst @@ -0,0 +1,85 @@ +.. meta:: + :description: Omniperf FAQ and troubleshooting + :keywords: Omniperf, FAQ, troubleshooting, ROCm, profiler, tool, Instinct, + accelerator, AMD, SSH, error, version, workaround, help + +*** +FAQ +*** + +Frequently asked questions and troubleshooting tips. + +How do I export profiling data I have already generated using Omniperf? +======================================================================= + +To interact with the Grafana GUI, you must sync data with the MongoDB +backend. You can do this using :ref:`database ` mode. + +Pass in the directory of your desired workload as follows. + +.. code-block:: shell + + $ omniperf database --import -w -H -u -t + +python ast error: 'Constant' object has no attribute 'kind' +=========================================================== + +This error arises from a bug in the default ``astunparse 1.6.3`` with +``python 3.8``. 
The error doesn't seem to occur with Python 3.7 or 3.9. + +Workaround: + +.. code-block:: shell + + $ pip3 uninstall astunparse + $ pip3 install astunparse + +tabulate doesn't print properly +=============================== + +To get around this issue, set the following environment variables to update your +locale settings. + +.. code-block:: shell + + $ export LC_ALL=C.UTF-8 + $ export LANG=C.UTF-8 + +How can I SSH tunnel in MobaXterm? +================================== + +1. Open MobaXterm. +2. In the top ribbon, select **Tunneling** to access tunneling options. + + .. image:: ../data/faq/tunnel_demo1.png + :align: center + :alt: MobaXterm Tunnel button + :width: 800 + + This pop-up should appear. + + .. image:: ../data/faq/tunnel_demo2.png + :align: center + :alt: MobaXterm pop-up + :width: 800 + +3. Select **New SSH tunnel**. + + .. image:: ../data/faq/tunnel_demo3.png + :align: center + :alt: MobaXterm pop-up + :width: 800 + +4. Configure the SSH tunnel. + + Local clients + * ````: ``[PORT]`` + + Remote server + * ````: ``localhost`` + * ````: ``[PORT]`` + + SSH server + * ````: *name of the server to connect to* + * ````: *username to log in to the server* + * ````: ``22`` diff --git a/projects/rocprofiler-compute/docs/sphinx/_toc.yml.in b/projects/rocprofiler-compute/docs/sphinx/_toc.yml.in new file mode 100644 index 0000000000..eb863b7a39 --- /dev/null +++ b/projects/rocprofiler-compute/docs/sphinx/_toc.yml.in @@ -0,0 +1,60 @@ +# Anywhere {branch} is used, the branch name will be substituted. +# These comments will also be removed. 
+defaults: + numbered: False + maxdepth: 6 +root: index +subtrees: + - entries: + - file: what-is-omniperf.rst + + - caption: Install + entries: + - file: install/core-install.rst + - file: install/grafana-setup.rst + title: Grafana server for Omniperf + + - caption: How to + entries: + - file: how-to/use.rst + - file: how-to/profile/mode.rst + - file: how-to/analyze/mode.rst + entries: + - file: how-to/analyze/cli.rst + - file: how-to/analyze/grafana-gui.rst + - file: how-to/analyze/standalone-gui.rst + + - caption: Conceptual + entries: + - file: conceptual/performance-model.rst + entries: + - file: conceptual/compute-unit.rst + title: Compute unit + entries: + - file: conceptual/pipeline-descriptions.rst + - file: conceptual/pipeline-metrics.rst + - file: conceptual/local-data-share.rst + title: Local data share + - file: conceptual/vector-l1-cache.rst + title: Vector L1 cache + - file: conceptual/l2-cache.rst + title: L2 cache + - file: conceptual/shader-engine.rst + title: Shader engine + - file: conceptual/command-processor.rst + title: Command processor + - file: conceptual/system-speed-of-light.rst + title: System Speed-of-Light + - file: conceptual/references.rst + - file: conceptual/definitions.rst + + - caption: Tutorials + entries: + - file: tutorial/profiling-by-example.rst + - file: tutorial/learning-resources.rst + + - caption: Reference + entries: + - file: reference/compatible-accelerators.rst + - file: reference/faq.rst + - file: license.rst diff --git a/projects/rocprofiler-compute/docs/sphinx/requirements.in b/projects/rocprofiler-compute/docs/sphinx/requirements.in new file mode 100644 index 0000000000..e503806ca1 --- /dev/null +++ b/projects/rocprofiler-compute/docs/sphinx/requirements.in @@ -0,0 +1,2 @@ +rocm-docs-core==1.6.1 +sphinxcontrib.datatemplates==0.11.0 diff --git a/projects/rocprofiler-compute/docs/sphinx/requirements.txt b/projects/rocprofiler-compute/docs/sphinx/requirements.txt new file mode 100644 index 0000000000..82d64eb291 
--- /dev/null +++ b/projects/rocprofiler-compute/docs/sphinx/requirements.txt @@ -0,0 +1,156 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements.in +# +accessible-pygments==0.0.5 + # via pydata-sphinx-theme +alabaster==0.7.16 + # via sphinx +babel==2.15.0 + # via + # pydata-sphinx-theme + # sphinx +beautifulsoup4==4.12.3 + # via pydata-sphinx-theme +breathe==4.35.0 + # via rocm-docs-core +certifi==2024.7.4 + # via requests +cffi==1.16.0 + # via + # cryptography + # pynacl +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via sphinx-external-toc +cryptography==43.0.0 + # via pyjwt +defusedxml==0.7.1 + # via sphinxcontrib-datatemplates +deprecated==1.2.14 + # via pygithub +docutils==0.21.2 + # via + # breathe + # myst-parser + # pydata-sphinx-theme + # sphinx +fastjsonschema==2.20.0 + # via rocm-docs-core +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via rocm-docs-core +idna==3.7 + # via requests +imagesize==1.4.1 + # via sphinx +jinja2==3.1.4 + # via + # myst-parser + # sphinx +markdown-it-py==3.0.0 + # via + # mdit-py-plugins + # myst-parser +markupsafe==2.1.5 + # via jinja2 +mdit-py-plugins==0.4.1 + # via myst-parser +mdurl==0.1.2 + # via markdown-it-py +myst-parser==3.0.1 + # via rocm-docs-core +packaging==24.1 + # via + # pydata-sphinx-theme + # sphinx +pycparser==2.22 + # via cffi +pydata-sphinx-theme==0.15.4 + # via + # rocm-docs-core + # sphinx-book-theme +pygithub==2.3.0 + # via rocm-docs-core +pygments==2.18.0 + # via + # accessible-pygments + # pydata-sphinx-theme + # sphinx +pyjwt[crypto]==2.8.0 + # via pygithub +pynacl==1.5.0 + # via pygithub +pyyaml==6.0.1 + # via + # myst-parser + # rocm-docs-core + # sphinx-external-toc + # sphinxcontrib-datatemplates +requests==2.32.3 + # via + # pygithub + # sphinx +rocm-docs-core==1.6.1 + # via -r requirements.in +smmap==5.0.1 + # via gitdb +snowballstemmer==2.2.0 + # via sphinx +soupsieve==2.5 + # via beautifulsoup4 
+sphinx==7.4.7 + # via + # breathe + # myst-parser + # pydata-sphinx-theme + # rocm-docs-core + # sphinx-book-theme + # sphinx-copybutton + # sphinx-design + # sphinx-external-toc + # sphinx-notfound-page + # sphinxcontrib-datatemplates + # sphinxcontrib-runcmd +sphinx-book-theme==1.1.3 + # via rocm-docs-core +sphinx-copybutton==0.5.2 + # via rocm-docs-core +sphinx-design==0.6.0 + # via rocm-docs-core +sphinx-external-toc==1.0.1 + # via rocm-docs-core +sphinx-notfound-page==1.0.2 + # via rocm-docs-core +sphinxcontrib-applehelp==1.0.8 + # via sphinx +sphinxcontrib-datatemplates==0.11.0 + # via -r requirements.in +sphinxcontrib-devhelp==1.0.6 + # via sphinx +sphinxcontrib-htmlhelp==2.0.6 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==1.0.8 + # via sphinx +sphinxcontrib-runcmd==0.2.0 + # via sphinxcontrib-datatemplates +sphinxcontrib-serializinghtml==1.1.10 + # via sphinx +tomli==2.0.1 + # via sphinx +typing-extensions==4.12.2 + # via + # pydata-sphinx-theme + # pygithub +urllib3==2.2.2 + # via + # pygithub + # requests +wrapt==1.16.0 + # via deprecated diff --git a/projects/rocprofiler-compute/docs/sphinx/static/css/o_custom.css b/projects/rocprofiler-compute/docs/sphinx/static/css/o_custom.css new file mode 100644 index 0000000000..a6cbe5718f --- /dev/null +++ b/projects/rocprofiler-compute/docs/sphinx/static/css/o_custom.css @@ -0,0 +1,30 @@ +:root { + --amd-teal-500: #00C2DE; + --amd-teal-750: #00788E; +} + +/* Override PyData Sphinx Theme default colors */ +html[data-theme='light'] { + --pst-color-primary: var(--amd-teal-750); + --pst-color-primary-bg: var(--amd-teal-500); + --pst-color-table-row-hover-bg: #E2E8F0; +} + +html[data-theme='dark'] { + --pst-color-primary: var(--amd-teal-500); + --pst-color-primary-bg: var(--amd-teal-750); + --pst-color-table-row-hover-bg: #1E293B; +} + +html[data-theme='light'], +html[data-theme='dark'] { + --pst-color-link: var(--pst-color-primary); +} + +a svg { + color: 
var(--pst-color-text-base); +} + +a svg:hover { + color: var(--pst-color-link-hover); +} diff --git a/projects/rocprofiler-compute/docs/tutorial/includes/infinity-fabric-transactions.rst b/projects/rocprofiler-compute/docs/tutorial/includes/infinity-fabric-transactions.rst new file mode 100644 index 0000000000..b60355bf7b --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/includes/infinity-fabric-transactions.rst @@ -0,0 +1,675 @@ +.. _infinity-fabric-example: + +Infinity Fabric transactions +============================ + + For this example, consider the + :dev-sample:`Infinity Fabric™ sample ` distributed as a part of + Omniperf. + +The following code snippet launches a simple read-only kernel. + +.. code-block:: cpp + + // the main streaming kernel + __global__ void kernel(int* x, size_t N, int zero) { + int sum = 0; + const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = 0; i < 10; ++i) { + for (size_t offset = offset_start; offset < N; offset += blockDim.x * gridDim.x) { + sum += x[offset]; + } + } + if (sum != 0) { + x[offset_start] = sum; + } + } + +This happens twice -- once as a warm-up and once for analysis. Note that the +buffer ``x`` is initialized to all zeros via a call to ``hipMemcpy`` on the +host before the kernel is ever launched. Therefore, the following conditional +is identically false -- and thus we expect no writes. + +.. code-block:: cpp + + if (sum != 0) { ... + +.. note:: + + The actual sample included with Omniperf also includes the ability to select + different operation types (such as atomics, writes). This abbreviated version + is presented here for reference only. + +Finally, this sample code lets the user control the +:ref:`granularity of an allocation `, the owner of an allocation +(local HBM, CPU DRAM or remote HBM), and the size of an allocation (the default +is :math:`\sim4`\ GiB) via command line arguments. 
In doing so, we can explore +the impact of these parameters on the L2-Fabric metrics reported by Omniperf to +further understand their meaning. + +.. note:: + + All results in this section were generated on a node of Infinity + Fabric connected MI250 accelerators using ROCm version 5.6.0, and Omniperf + version 2.0.0. Although results may vary with ROCm versions and accelerator + connectivity, we expect the lessons learned here to be broadly applicable. + +.. _infinity-fabric-ex1: + +Experiment 1: Coarse-grained, accelerator-local HBM reads +----------------------------------------------------------- + +In our first experiment, we consider the simplest possible case, a +``hipMalloc``\ ’d buffer that is local to our current accelerator: + +.. code-block:: shell-session + + $ omniperf profile -n coarse_grained_local --no-roof -- ./fabric -t 1 -o 0 + Using: + mtype:CoarseGrained + mowner:Device + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 + <...> + $ omniperf analyze -p workloads/coarse_grained_local/mi200 -b 17.2.0 17.2.1 17.2.2 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 + <...> + 17.
L2 Cache + 17.2 L2 - Fabric Transactions + ╒═════════╤═════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ + │ 17.2.0 │ L2-Fabric Read BW │ 42947428672.00 │ 42947428672.00 │ 42947428672.00 │ Bytes per kernel │ + ├─────────┼─────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.1 │ HBM Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼─────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.2 │ Remote Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧═════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ + 17.4 L2 - Fabric Interface Stalls + ╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ + │ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.07 │ 0.07 │ 0.07 │ Pct │ + ╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ + 17.5 L2 - Fabric Detailed Transaction Breakdown + ╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit 
│ + ╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ + │ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.1 │ Read (Uncached) │ 1450.00 │ 1450.00 │ 1450.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.2 │ Read (64B) │ 671053573.00 │ 671053573.00 │ 671053573.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.3 │ HBM Read │ 671053565.00 │ 671053565.00 │ 671053565.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.4 │ Remote Read │ 8.00 │ 8.00 │ 8.00 │ Req per kernel │ + ╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ + +Here, you can make the following observations. + +- The vast majority of L2-Fabric requests (>99%) are 64B + read requests (**17.5.2**). + +- Nearly 100% of the read requests (**17.2.1**) are homed in on the + accelerator-local HBM (**17.5.3**), while some small fraction of these reads are + routed to a “remote” device (**17.5.4**). + +- These drive a :math:`\sim40`\ GiB per kernel read-bandwidth (**17.2.0**). + +In addition, we see a small amount of :ref:`uncached ` reads +(**17.5.1**), these correspond to things like: + +* The assembly code to execute the kernel + +* Kernel arguments + +* Coordinate parameters (such as ``blockDim.z``) that were not initialized by the + hardware, etc. and may account for some of our "remote" read requests + (**17.5.4**), for example, reading from CPU DRAM + +The above list is not exhaustive, nor are all of these guaranteed to be +"uncached" – the exact implementation depends on the accelerator and +ROCm versions used. 
These read requests could be interrogated further in +the :ref:`Scalar L1 Data Cache ` and +:ref:`Instruction Cache ` metric sections. + +.. note:: + + The Traffic metrics in Sec **17.2** are presented as a percentage of the total + number of requests. For example, "HBM Read Traffic" is the percent of read requests + (**17.5.0** - **17.5.2**) that were directed to the accelerators' local HBM (**17.5.3**). + +.. _infinity-fabric-ex2: + +Experiment 2: Fine-grained, accelerator-local HBM reads +--------------------------------------------------------- + +In this experiment, we change the :ref:`granularity ` of our +device-allocation to be fine-grained device memory, local to the current +accelerator. Our code uses the ``hipExtMallocWithFlag`` API with the +``hipDeviceMallocFinegrained`` flag to accomplish this. + +.. note:: + + On some systems (such as those with only PCIe® connected accelerators), you need + to set the environment variable ``HSA_FORCE_FINE_GRAIN_PCIE=1`` to enable + this memory type. + +.. code-block:: shell-session + + $ omniperf profile -n fine_grained_local --no-roof -- ./fabric -t 0 -o 0 + Using: + mtype:FineGrained + mowner:Device + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 + <...> + $ omniperf analyze -p workloads/fine_grained_local/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 + <...> + 17. 
L2 Cache + 17.2 L2 - Fabric Transactions + ╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ + │ 17.2.0 │ L2-Fabric Read BW │ 42948661824.00 │ 42948661824.00 │ 42948661824.00 │ Bytes per kernel │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.1 │ HBM Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.2 │ Remote Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.3 │ Uncached Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ + 17.4 L2 - Fabric Interface Stalls + ╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ + │ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.07 │ 0.07 │ 0.07 │ Pct │ + ╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ + 
17.5 L2 - Fabric Detailed Transaction Breakdown + ╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ + │ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.1 │ Read (Uncached) │ 1334.00 │ 1334.00 │ 1334.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.2 │ Read (64B) │ 671072841.00 │ 671072841.00 │ 671072841.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.3 │ HBM Read │ 671072835.00 │ 671072835.00 │ 671072835.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.4 │ Remote Read │ 6.00 │ 6.00 │ 6.00 │ Req per kernel │ + ╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ + +Comparing with our :ref:`previous example `, we see a +relatively similar result, namely: + +- The vast majority of L2-Fabric requests are 64B read requests (**17.5.2**) + +- Nearly all these read requests are directed to the accelerator-local HBM (**17.2.1**) + +In addition, we now see a small percentage of HBM Read Stalls (**17.4.2**), +as streaming fine-grained memory is putting more stress on Infinity +Fabric. + +.. note:: + + The stalls in Sec 17.4 are presented as a percentage of the total number of + active L2 cycles, summed over :doc:`all L2 channels `. + +.. _infinity-fabric-ex3: + +Experiment 3: Fine-grained, remote-accelerator HBM reads +---------------------------------------------------------- + +In this experiment, we move our :ref:`fine-grained ` allocation to +be owned by a remote accelerator.
We accomplish this by first changing +the HIP device using, for instance, the ``hipSetDevice(1)`` API, then allocating +fine-grained memory (as described :ref:`previously `), and +finally resetting the device back to the default, for instance, +``hipSetDevice(0)``. + +Although we have not changed our code significantly, we do see a +substantial change in the L2-Fabric metrics: + +.. code-block:: shell-session + + $ omniperf profile -n fine_grained_remote --no-roof -- ./fabric -t 0 -o 2 + Using: + mtype:FineGrained + mowner:Remote + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 + <...> + $ omniperf analyze -p workloads/fine_grained_remote/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 + <...> + 17. L2 Cache + 17.2 L2 - Fabric Transactions + ╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ + │ 17.2.0 │ L2-Fabric Read BW │ 42949692736.00 │ 42949692736.00 │ 42949692736.00 │ Bytes per kernel │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.3 │ Uncached Read Traffic │ 200.00 │ 200.00 │ 200.00 │ Pct │ + ╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ + 17.4 L2 - Fabric Interface Stalls + ╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric 
│ Type │ Transaction │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ + │ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 17.85 │ 17.85 │ 17.85 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ + 17.5 L2 - Fabric Detailed Transaction Breakdown + ╒═════════╤═════════════════╤═══════════════╤═══════════════╤═══════════════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════╪═══════════════╪═══════════════╪═══════════════╪════════════════╡ + │ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.1 │ Read (Uncached) │ 1342177894.00 │ 1342177894.00 │ 1342177894.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.2 │ Read (64B) │ 671088949.00 │ 671088949.00 │ 671088949.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.3 │ HBM Read │ 307.00 │ 307.00 │ 307.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.4 │ Remote Read │ 671088642.00 │ 671088642.00 │ 671088642.00 │ Req per kernel │ + ╘═════════╧═════════════════╧═══════════════╧═══════════════╧═══════════════╧════════════════╛ + +First, we see that 
while we still observe approximately the same number +of 64B Read Requests (**17.5.2**), we now see an even larger number of +Uncached Read Requests (**17.5.1**). Some simple division reveals: + +.. math:: + + 1342177894.00 / 671088949.00 \approx 2 + +That is, each 64B Read Request is *also* counted as two Uncached Read +Requests, as reflected in the :ref:`request-flow diagram `. +This is also why the Uncached Read Traffic metric (**17.2.3**) is at the +counter-intuitive value of 200%! + +In addition, observe that: + +- We no longer see any significant number of HBM Read Requests (**17.2.1**, + **17.5.3**), nor HBM Read Stalls (**17.4.2**), but instead, + +- we see that almost all of these requests are considered “remote” + (**17.2.2**, **17.5.4**) and are being routed to another + accelerator, or the CPU — in this case HIP Device 1 — and, + +- we see a significantly larger percentage of AMD Infinity Fabric Read Stalls + (**17.4.1**) as compared to the HBM Read Stalls in the + :ref:`previous example `. + +These stalls correspond to reads that are going out over the AMD +Infinity Fabric connection to another MI250 accelerator. In +addition, because these are crossing between accelerators, we expect +significantly lower achievable bandwidths as compared to the local +accelerator’s HBM – this is reflected (indirectly) in the magnitude of +the stall metric (**17.4.1**). Finally, we note that if our system contained +only PCIe connected accelerators, these observations would differ. + +.. _infinity-fabric-ex4: + +Experiment 4: Fine-grained, CPU-DRAM reads +-------------------------------------------- + +In this experiment, we move our :ref:`fine-grained ` allocation to +be owned by the CPU’s DRAM. We accomplish this by allocating host-pinned +fine-grained memory using the ``hipHostMalloc`` API: + +..
code-block:: shell-session + + $ omniperf profile -n fine_grained_host --no-roof -- ./fabric -t 0 -o 1 + Using: + mtype:FineGrained + mowner:Host + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 + <...> + $ omniperf analyze -p workloads/fine_grained_host/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 + <...> + 17. L2 Cache + 17.2 L2 - Fabric Transactions + ╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ + │ 17.2.0 │ L2-Fabric Read BW │ 42949691264.00 │ 42949691264.00 │ 42949691264.00 │ Bytes per kernel │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.3 │ Uncached Read Traffic │ 200.00 │ 200.00 │ 200.00 │ Pct │ + ╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ + 17.4 L2 - Fabric Interface Stalls + ╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ + │ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 91.29 │ 91.29 │ 91.29 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.1 │ Read - 
Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ + 17.5 L2 - Fabric Detailed Transaction Breakdown + ╒═════════╤═════════════════╤═══════════════╤═══════════════╤═══════════════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════╪═══════════════╪═══════════════╪═══════════════╪════════════════╡ + │ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.1 │ Read (Uncached) │ 1342177848.00 │ 1342177848.00 │ 1342177848.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.2 │ Read (64B) │ 671088926.00 │ 671088926.00 │ 671088926.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.3 │ HBM Read │ 284.00 │ 284.00 │ 284.00 │ Req per kernel │ + ├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ + │ 17.5.4 │ Remote Read │ 671088642.00 │ 671088642.00 │ 671088642.00 │ Req per kernel │ + ╘═════════╧═════════════════╧═══════════════╧═══════════════╧═══════════════╧════════════════╛ + +Here we see *almost* the same results as in the +:ref:`previous experiment `, however now as we are crossing +a PCIe bus to the CPU, we see that the Infinity Fabric Read stalls (**17.4.1**) +have shifted to be a PCIe stall (**17.4.0**).
In addition, as (on this +system) the PCIe bus has a lower peak bandwidth than the AMD Infinity +Fabric connection between two accelerators, we once again observe an +increase in the percentage of stalls on this interface. + +.. note:: + + Had we performed this same experiment on an + `MI250X system `_, + these transactions would again have been marked as Infinity Fabric Read + stalls (**17.4.1**), as the CPU is connected to the accelerator via AMD Infinity + Fabric. + +.. _infinity-fabric-ex5: + +Experiment 5: Coarse-grained, CPU-DRAM reads +---------------------------------------------- + +In our next fabric experiment, we change our CPU memory allocation to be +`coarse-grained `__. We accomplish this by passing the +``hipHostMalloc`` API the ``hipHostMallocNonCoherent`` flag, to mark the +allocation as coarse-grained: + +.. code-block:: shell-session + + $ omniperf profile -n coarse_grained_host --no-roof -- ./fabric -t 1 -o 1 + Using: + mtype:CoarseGrained + mowner:Host + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 + <...> + $ omniperf analyze -p workloads/coarse_grained_host/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 + <...> + 17. 
L2 Cache + 17.2 L2 - Fabric Transactions + ╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ + │ 17.2.0 │ L2-Fabric Read BW │ 42949691264.00 │ 42949691264.00 │ 42949691264.00 │ Bytes per kernel │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.3 │ Uncached Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ + 17.4 L2 - Fabric Interface Stalls + ╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ + │ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 91.27 │ 91.27 │ 91.27 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ + 
17.5 L2 - Fabric Detailed Transaction Breakdown + ╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ + │ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.1 │ Read (Uncached) │ 562.00 │ 562.00 │ 562.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.2 │ Read (64B) │ 671088926.00 │ 671088926.00 │ 671088926.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.3 │ HBM Read │ 281.00 │ 281.00 │ 281.00 │ Req per kernel │ + ├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.4 │ Remote Read │ 671088645.00 │ 671088645.00 │ 671088645.00 │ Req per kernel │ + ╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ + +Here we see a similar result to our +:ref:`previous experiment `, with one key difference: our +accesses are no longer marked as Uncached Read requests (**17.2.3, 17.5.1**), but instead +are 64B read requests (**17.5.2**), as observed in our +:ref:`Coarse-grained, accelerator-local HBM ` experiment. + +.. _infinity-fabric-ex6: + +Experiment 6: Fine-grained, CPU-DRAM writes +-------------------------------------------- + +Thus far in our exploration of the L2-Fabric interface, we have +primarily focused on read operations. However, in +:ref:`our request flow diagram `, we note that writes are +counted separately. To observe this, we use the ``-p`` flag to trigger write +operations to fine-grained memory allocated on the host: + +.. 
code-block:: shell-session + + $ omniperf profile -n fine_grained_host_write --no-roof -- ./fabric -t 0 -o 1 -p 1 + Using: + mtype:FineGrained + mowner:Host + mspace:Global + mop:Write + mdata:Unsigned + remoteId:-1 + <...> + $ omniperf analyze -p workloads/fine_grained_host_write/mi200 -b 17.2.4 17.2.5 17.2.6 17.2.7 17.2.8 17.4.3 17.4.4 17.4.5 17.4.6 17.5.5 17.5.6 17.5.7 17.5.8 17.5.9 17.5.10 -n per_kernel --dispatch 2 + <...> + 17. L2 Cache + 17.2 L2 - Fabric Transactions + ╒═════════╤═══════════════════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ + │ 17.2.4 │ L2-Fabric Write and Atomic BW │ 42949672960.00 │ 42949672960.00 │ 42949672960.00 │ Bytes per kernel │ + ├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.5 │ HBM Write and Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.6 │ Remote Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.7 │ Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ + │ 17.2.8 │ Uncached Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ╘═════════╧═══════════════════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ + 17.4 L2 - Fabric Interface Stalls + ╒═════════╤════════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +
╞═════════╪════════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ + │ 17.4.3 │ Write - PCIe Stall │ PCIe Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.4 │ Write - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.5 │ Write - HBM Stall │ HBM Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.6 │ Write - Credit Starvation │ Credit Starvation │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧════════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ + 17.5 L2 - Fabric Detailed Transaction Breakdown + ╒═════════╤═════════════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ + │ 17.5.5 │ Write (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.6 │ Write (Uncached) │ 671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.7 │ Write (64B) │ 671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.8 │ HBM Write and Atomic │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + 
│ 17.5.9 │ Remote Write and Atomic │ 671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ + │ 17.5.10 │ Atomic │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ╘═════════╧═════════════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ + +Here we notice a few changes in our request pattern: + +* As expected, the requests have changed from 64B Reads to 64B Write requests + (**17.5.7**), + +* these requests are homed in on a “remote” destination (**17.2.6, 17.5.9**), as + expected, and + +* these are also counted as a single Uncached Write request (**17.5.6**). + +In addition, there are rather significant changes in the bandwidth values +reported: + +- The “L2-Fabric Write and Atomic” bandwidth metric (**17.2.4**) + reports about 40GiB of data written across Infinity Fabric while + +- The “Remote Write and Atomic Traffic” metric (**17.2.6**) indicates that nearly + 100% of these requests are being directed to a remote source. + +The precise meaning of these metrics is explored in the +:ref:`subsequent experiment `. + +Finally, we note that we see no write stalls on the PCIe bus +(**17.4.3**). This is because writes over a PCIe bus `are +posted `_, +that is, they do not require acknowledgement. + +.. _infinity-fabric-ex7: + +Experiment 7: Fine-grained, CPU-DRAM atomicAdd +------------------------------------------------ + +Next, we change our experiment to instead target ``atomicAdd`` +operations to the CPU’s DRAM. + +.. code-block:: shell-session + + $ omniperf profile -n fine_grained_host_add --no-roof -- ./fabric -t 0 -o 1 -p 2 + Using: + mtype:FineGrained + mowner:Host + mspace:Global + mop:Add + mdata:Unsigned + remoteId:-1 + <...> + $ omniperf analyze -p workloads/fine_grained_host_add/mi200 -b 17.2.4 17.2.5 17.2.6 17.2.7 17.2.8 17.4.3 17.4.4 17.4.5 17.4.6 17.5.5 17.5.6 17.5.7 17.5.8 17.5.9 17.5.10 -n per_kernel --dispatch 2 + <...> + 17.
L2 Cache + 17.2 L2 - Fabric Transactions + ╒═════════╤═══════════════════════════════════╤══════════════╤══════════════╤══════════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════════════════╪══════════════╪══════════════╪══════════════╪══════════════════╡ + │ 17.2.4 │ L2-Fabric Write and Atomic BW │ 429496736.00 │ 429496736.00 │ 429496736.00 │ Bytes per kernel │ + ├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ + │ 17.2.5 │ HBM Write and Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ + │ 17.2.6 │ Remote Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ + │ 17.2.7 │ Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ + │ 17.2.8 │ Uncached Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ + ╘═════════╧═══════════════════════════════════╧══════════════╧══════════════╧══════════════╧══════════════════╛ + 17.4 L2 - Fabric Interface Stalls + ╒═════════╤════════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ + │ 17.4.3 │ Write - PCIe Stall │ PCIe Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.4 │ Write - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + 
├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.5 │ Write - HBM Stall │ HBM Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ + │ 17.4.6 │ Write - Credit Starvation │ Credit Starvation │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ╘═════════╧════════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ + 17.5 L2 - Fabric Detailed Transaction Breakdown + ╒═════════╤═════════════════════════╤═════════════╤═════════════╤═════════════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════════╪═════════════╪═════════════╪═════════════╪════════════════╡ + │ 17.5.5 │ Write (32B) │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ + │ 17.5.6 │ Write (Uncached) │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ + │ 17.5.7 │ Write (64B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ + │ 17.5.8 │ HBM Write and Atomic │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ + │ 17.5.9 │ Remote Write and Atomic │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ + ├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ + │ 17.5.10 │ Atomic │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ + ╘═════════╧═════════════════════════╧═════════════╧═════════════╧═════════════╧════════════════╛ + +In this case, there is quite a lot to unpack: + +- For the first time, the 
32B Write requests (**17.5.5**) are heavily used. + +- These correspond to Atomic requests (**17.2.7, 17.5.10**), and are counted as + Uncached Writes (**17.5.6**). + +- The L2-Fabric Write and Atomic bandwidth metric (**17.2.4**) shows about 0.4 + GiB of traffic. For convenience, the sample reduces the default problem size + for this case due to the speed of atomics across a PCIe bus, and finally, + +- The traffic is directed to a remote device (**17.2.6, 17.5.9**). + +Let's consider what an “atomic” request means in this context. Recall +that we are discussing memory traffic flowing from the L2 cache, the +device-wide coherence point on current CDNA accelerators such as the +MI250, to for example, the CPU’s DRAM. In this light, we see that these +requests correspond to *system scope* atomics, and specifically in the +case of the MI250, to fine-grained memory. + + +.. rubric:: Disclaimer + +PCIe® is a registered trademark of PCI-SIG Corporation. + +.. + `Leave as possible future experiment to add + + + ### Experiment #2 - Non-temporal writes + + If we take the same code (for convenience only) as previously described, we can demonstrate how to achieve 'streaming' writes, as described in the [L2 Cache Access metrics](L2_cache_metrics) section. 
+ To see this, we use the Clang built-in [`__builtin_nontemporal_store`](https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins), for example + + ``` + template + __device__ void store (T* ptr, T val) { + __builtin_nontemporal_store(val, ptr); + } + ``` + + On an AMD MI2XX accelerator, for FP32 values this will generate a `global_store_dword` instruction, with the `glc` and `slc` bits set, described in [section 10.1](https://developer.amd.com/wp-content/resources/CDNA2_Shader_ISA_4February2022.pdf) of the CDNA2 ISA guide.` diff --git a/projects/rocprofiler-compute/docs/tutorial/includes/instructions-per-cycle-and-utilizations.rst b/projects/rocprofiler-compute/docs/tutorial/includes/instructions-per-cycle-and-utilizations.rst new file mode 100644 index 0000000000..dcbf372663 --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/includes/instructions-per-cycle-and-utilizations.rst @@ -0,0 +1,486 @@ +.. _ipc-example: + +Instructions-per-cycle and utilizations example +=============================================== + +For this example, consider the +:dev-sample:`instructions-per-cycle (IPC) example ` included with +Omniperf. + +This example is compiled using ``c++17`` support: + +.. code-block:: shell + + $ hipcc -O3 ipc.hip -o ipc -std=c++17 + +and was run on an MI250 CDNA2 accelerator: + +.. code-block:: shell + + $ omniperf profile -n ipc --no-roof -- ./ipc + +The results shown in this section are *generally* applicable to CDNA +accelerators, but may vary between generations and specific products. + +.. _ipc-experiment-design-note: + +Design note +----------- + +The kernels in this example all execute a specific assembly operation +``N`` times (1000, by default), for instance the ``vmov`` kernel: + +.. 
code-block:: cpp + + template + __device__ void vmov_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_mov_b32 v0, v1\n" : : "{v31}"(dummy)); + vmov_op(); + } + } + + template + __global__ void vmov() { + vmov_op(); + } + +The kernels are then launched twice, once for a warm-up run, and once +for measurement. + +.. _ipc-valu-utilization: + +VALU utilization and IPC +------------------------ + +Now we can use our test to measure the achieved instructions-per-cycle +of various types of instructions. We start with a simple :ref:`VALU ` +operation, i.e., a ``v_mov_b32`` instruction, e.g.: + +.. code-block:: asm + + v_mov_b32 v0, v1 + +This instruction simply copies the contents from the source register +(``v1``) to the destination register (``v0``). Investigating this kernel +with Omniperf, we see: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 7 -b 11.2 + <...> + -------------------------------------------------------------------------------- + 0. Top Stat + ╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡ + │ 0 │ void vmov<1000>() [clone .kd] │ 1.00 │ 99317423.00 │ 99317423.00 │ 99317423.00 │ 100.00 │ + ╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 11. 
Compute Units - Compute Pipeline + 11.2 Pipeline Stats + ╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡ + │ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.2 │ SALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.3 │ VALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.5 │ Branch Util │ 0.1 │ 0.1 │ 0.1 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.6 │ VALU Active Threads │ 64.0 │ 64.0 │ 64.0 │ Threads │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │ + ╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛ + +Here we see that: + +1. Both the IPC (**11.2.0**) and “Issued” IPC (**11.2.1**) metrics are + :math:`\sim 1` +2. The VALU Utilization metric (**11.2.3**) is also :math:`\sim100\%`, and + finally +3. The VALU Active Threads metric (**11.2.4**) is 64, i.e., the wavefront + size on CDNA accelerators, as all threads in the wavefront are + active. + +We will explore the difference between the IPC (**11.2.0**) and “Issued” IPC +(**11.2.1**) metrics in the :ref:`next section `. + +Additionally, we notice a small (0.1%) Branch utilization (**11.2.5**). 
+Inspecting the assembly of this kernel shows there are no branch +operations, however recalling the note in the :ref:`Pipeline +statistics ` section: + + The branch utilization <…> includes time spent in other instruction + types (namely: ``s_endpgm``) that are *typically* a very small + percentage of the overall kernel execution. + +We see that this is coming from execution of the ``s_endpgm`` +instruction at the end of every wavefront. + +.. note:: + + Technically, the cycle counts used in the denominators of our IPC metrics are + actually in units of quad-cycles, a group of 4 consecutive cycles. However, a + typical :ref:`VALU ` instruction on CDNA accelerators runs for a + single quad-cycle (see :gcn-crash-course:`30`). Therefore, for simplicity, we + simply report these metrics as "instructions per cycle". + +.. _issued-ipc: + +Exploring “issued” IPC via MFMA operations +------------------------------------------ + +.. warning:: + + The MFMA assembly operations used in this example are inherently not portable + to older CDNA architectures. + +Unlike the simple quad-cycle ``v_mov_b32`` operation discussed in our +:ref:`previous example `, some operations take many +quad-cycles to execute. For example, using the +`AMD Matrix Instruction Calculator `_ +we can see that some :ref:`MFMA ` operations take 64 cycles, e.g.: + +.. code-block:: shell + + $ ./matrix_calculator.py --arch CDNA2 --detail-instruction --instruction v_mfma_f32_32x32x8bf16_1k + Architecture: CDNA2 + Instruction: V_MFMA_F32_32X32X8BF16_1K + <...> + Execution statistics: + FLOPs: 16384 + Execution cycles: 64 + FLOPs/CU/cycle: 1024 + Can co-execute with VALU: True + VALU co-execution cycles possible: 60 + +What happens to our IPC when we utilize this ``v_mfma_f32_32x32x8bf16_1k`` +instruction on a CDNA2 accelerator? To find out, we turn to our ``mfma`` kernel +in the IPC example: + +.. 
code-block:: shell + + $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 8 -b 11.2 --decimal 4 + <...> + -------------------------------------------------------------------------------- + 0. Top Stat + ╒════╤═══════════════════════════════╤═════════╤═════════════════╤═════════════════╤═════════════════╤══════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪═══════════════════════════════╪═════════╪═════════════════╪═════════════════╪═════════════════╪══════════╡ + │ 0 │ void mfma<1000>() [clone .kd] │ 1.0000 │ 1623167595.0000 │ 1623167595.0000 │ 1623167595.0000 │ 100.0000 │ + ╘════╧═══════════════════════════════╧═════════╧═════════════════╧═════════════════╧═════════════════╧══════════╛ + + + -------------------------------------------------------------------------------- + 11. Compute Units - Compute Pipeline + 11.2 Pipeline Stats + ╒═════════╤═════════════════════╤═════════╤═════════╤═════════╤══════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════╪═════════╪═════════╪═════════╪══════════════╡ + │ 11.2.0 │ IPC │ 0.0626 │ 0.0626 │ 0.0626 │ Instr/cycle │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.1 │ IPC (Issued) │ 1.0000 │ 1.0000 │ 1.0000 │ Instr/cycle │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.2 │ SALU Util │ 0.0000 │ 0.0000 │ 0.0000 │ Pct │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.3 │ VALU Util │ 6.2496 │ 6.2496 │ 6.2496 │ Pct │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.4 │ VMEM Util │ 0.0000 │ 0.0000 │ 0.0000 │ Pct │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.5 │ Branch Util │ 0.0062 │ 0.0062 │ 0.0062 │ Pct │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.6 │ VALU Active Threads │ 64.0000 │ 64.0000 │ 
64.0000 │ Threads │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.7 │ MFMA Util │ 99.9939 │ 99.9939 │ 99.9939 │ Pct │ + ├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ + │ 11.2.8 │ MFMA Instr Cycles │ 64.0000 │ 64.0000 │ 64.0000 │ Cycles/instr │ + ╘═════════╧═════════════════════╧═════════╧═════════╧═════════╧══════════════╛ + +In contrast to our :ref:`VALU IPC example `, we now see +that the IPC metric (**11.2.0**) and Issued IPC (**11.2.1**) metric differ +substantially. First, we see the VALU utilization (**11.2.3**) has decreased +substantially, from nearly 100% to :math:`\sim6.25\%`. We note that this matches +the ratio of: :math:`((Execution\ cycles) - (VALU\ coexecution\ cycles)) / (Execution\ cycles)` +reported by the matrix calculator, while the MFMA utilization (**11.2.7**) +has increased to nearly 100%. + +Recall that our ``v_mfma_f32_32x32x8bf16_1k`` instruction takes 64 cycles to +execute, or 16 quad-cycles, matching our observed MFMA Instruction +Cycles (**11.2.8**). That is, we have a single instruction executed every 16 +quad-cycles, or :math:`1/16 = 0.0625`, which is almost identical to our IPC +metric (**11.2.0**). Why then is the Issued IPC metric (**11.2.1**) equal to 1.0? + +Instead of simply counting the number of instructions issued and +dividing by the number of cycles the :doc:`CUs ` on +the accelerator were active (as is done for **11.2.0**), this metric is formulated +differently, and instead counts the number of +(non-:ref:`internal `) instructions issued divided +by the number of (quad-) cycles where the :ref:`scheduler ` was +actively working on issuing instructions. 
Thus the Issued IPC metric +(**11.2.1**) gives more of a sense of “what percent of the total number of +:ref:`scheduler ` cycles did a wave schedule an instruction?” +while the IPC metric (**11.2.0**) indicates the ratio of the number of +instructions executed over the total +:ref:`active CU cycles `. + +.. warning:: + + There are further complications of the Issued IPC metric (**11.2.1**) that make + its use more complicated. We will be explore that in the + :ref:`following section `. For these reasons, + Omniperf typically promotes use of the regular IPC metric (**11.2.0**), e.g., in + the top-level Speed-of-Light chart. + +.. _ipc-internal-instructions: + +Internal instructions and IPC +----------------------------- + +Next, we explore the concept of an “internal” instruction. From +:gcn-crash-course:`29`, we see a few candidates for internal instructions, and +we choose a ``s_nop`` instruction, which according to the +:mi200-isa-pdf:`CDNA2 ISA guide <>`: + + Does nothing; it can be repeated in hardware up to eight times. + +Here we choose to use the following no-op to make our point: + +.. code-block:: asm + + s_nop 0x0 + +Running this kernel through Omniperf yields: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 9 -b 11.2 + <...> + -------------------------------------------------------------------------------- + 0. Top Stat + ╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡ + │ 0 │ void snop<1000>() [clone .kd] │ 1.00 │ 14221851.50 │ 14221851.50 │ 14221851.50 │ 100.00 │ + ╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 11. 
Compute Units - Compute Pipeline + 11.2 Pipeline Stats + ╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡ + │ 11.2.0 │ IPC │ 6.79 │ 6.79 │ 6.79 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.2 │ SALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.3 │ VALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.5 │ Branch Util │ 0.68 │ 0.68 │ 0.68 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.6 │ VALU Active Threads │ │ │ │ Threads │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │ + ╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛ + +First, we see that the IPC metric (**11.2.0**) tops our theoretical maximum +of 5 instructions per cycle (discussed in the :ref:`scheduler ` +section). How can this be? + +Recall that :gcn-crash-course:`27` say “no functional unit” for the internal +instructions. This removes the limitation on the IPC. If we are *only* +issuing internal instructions, we are not issuing to any execution +units! However, workloads such as these are almost *entirely* artificial +(that is, repeatedly issuing internal instructions almost exclusively). 
In +practice, a maximum IPC of 5 is expected in almost all cases. + +Secondly, note that our “Issued” IPC (**11.2.1**) is still equal to +one here.
Top Stat + ╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡ + │ 0 │ void smov<1000>() [clone .kd] │ 1.00 │ 96246554.00 │ 96246554.00 │ 96246554.00 │ 100.00 │ + ╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 11. Compute Units - Compute Pipeline + 11.2 Pipeline Stats + ╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡ + │ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.2 │ SALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.3 │ VALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.5 │ Branch Util │ 0.1 │ 0.1 │ 0.1 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.6 │ VALU Active Threads │ │ │ │ Threads │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │ + ╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛ + +Here we see that: + +- 
Both our IPC (**11.2.0**) and Issued IPC (**11.2.1**) are + :math:`\sim1.0` as expected, and + +- The SALU Utilization (**11.2.2**) was + nearly 100% as it was active for almost the entire kernel. + +VALU Active Threads +------------------- + +For our final IPC/Utilization example, we consider a slight modification +of our :ref:`v_mov ` example: + +.. code-block:: cpp + + template + __global__ void vmov_with_divergence() { + if (threadIdx.x % 64 == 0) + vmov_op(); + } + +That is, we wrap our :ref:`VALU ` operation inside a conditional +where only one lane in our wavefront is active. Running this kernel +through Omniperf yields: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 11 -b 11.2 + <...> + -------------------------------------------------------------------------------- + 0. Top Stat + ╒════╤══════════════════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪══════════════════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡ + │ 0 │ void vmov_with_divergence<1000>() [clone │ 1.00 │ 97125097.00 │ 97125097.00 │ 97125097.00 │ 100.00 │ + │ │ .kd] │ │ │ │ │ │ + ╘════╧══════════════════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 11. 
Compute Units - Compute Pipeline + 11.2 Pipeline Stats + ╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡ + │ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.2 │ SALU Util │ 0.1 │ 0.1 │ 0.1 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.3 │ VALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.5 │ Branch Util │ 0.2 │ 0.2 │ 0.2 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.6 │ VALU Active Threads │ 1.13 │ 1.13 │ 1.13 │ Threads │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ + ├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ + │ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │ + ╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛ + +Here we see that once again, our VALU Utilization (**11.2.3**) is nearly +100%. However, we note that the VALU Active Threads metric (**11.2.6**) is +:math:`\sim 1`, which matches our conditional in the source code. So +VALU Active Threads reports the average number of lanes of our wavefront +that are active over all :ref:`VALU ` instructions, or thread +“convergence” (i.e., 1 - :ref:`divergence `). + +.. note:: + + 1. 
The act of evaluating a vector conditional in this example typically triggers VALU operations, contributing to why the VALU Active Threads metric is not identically one. + 2. This metric is a time (cycle) averaged value, and thus contains an implicit dependence on the duration of various VALU instructions. + + Nonetheless, this metric serves as a useful measure of thread-convergence. + +Finally, we note that our branch utilization (**11.2.5**) has increased +slightly from our baseline, as we now have a branch (checking the value +of ``threadIdx.x``). diff --git a/projects/rocprofiler-compute/docs/tutorial/includes/lds-examples.rst b/projects/rocprofiler-compute/docs/tutorial/includes/lds-examples.rst new file mode 100644 index 0000000000..f6cff7b722 --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/includes/lds-examples.rst @@ -0,0 +1,272 @@ +.. _lds-examples: + +LDS examples +============ + +For this example, consider the +:dev-sample:`LDS sample ` distributed as a part of Omniperf. This +code contains two kernels to explore how both :doc:`LDS ` bandwidth and +bank conflicts are calculated in Omniperf. + +This example was compiled and run on an MI250 accelerator using ROCm +v5.6.0, and Omniperf v2.0.0. + +.. code-block:: shell-session + + $ hipcc -O3 lds.hip -o lds + +Finally, we generate our ``omniperf profile`` as: + +.. code-block:: shell-session + + $ omniperf profile -n lds --no-roof -- ./lds + +.. _lds-bandwidth: + +LDS bandwidth +------------- + +To explore our *theoretical LDS bandwidth* metric, we use a simple +kernel: + +.. 
code-block:: cpp + + constexpr unsigned max_threads = 256; + __global__ void load(int* out, int flag) { + __shared__ int array[max_threads]; + int index = threadIdx.x; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; + } + +Here we: + +* Create an array of 256 integers in :doc:`LDS ` + +* Fake a write to the LDS using the ``flag`` variable (always set to zero on the + host) to avoid dead-code elimination + +* Read a single integer per work-item from ``threadIdx.x`` of the LDS array + +* If the integer is equal to a magic number (always false), write the value out + to global memory to again, avoid dead-code elimination + +Finally, we launch this kernel repeatedly, varying the number of threads +in our workgroup: + +.. code-block:: cpp + + void bandwidth_demo(int N) { + for (int i = 1; i <= N; ++i) + load<<<1,i>>>(nullptr, 0); + hipDeviceSynchronize(); + } + +Next, let’s analyze the first of our bandwidth kernel dispatches: + +.. code-block:: shell + + $ omniperf analyze -p workloads/lds/mi200/ -b 12.2.1 --dispatch 0 -n per_kernel + <...> + 12. Local Data Share (LDS) + 12.2 LDS Stats + ╒═════════╤═══════════════════════╤════════╤════════╤════════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪════════╪════════╪════════╪══════════════════╡ + │ 12.2.1 │ Theoretical Bandwidth │ 256.00 │ 256.00 │ 256.00 │ Bytes per kernel │ + ╘═════════╧═══════════════════════╧════════╧════════╧════════╧══════════════════╛ + +Here we see that our Theoretical Bandwidth metric (**12.2.1**) is reporting +256 Bytes were loaded even though we launched a single work-item +workgroup, and thus only loaded a single integer from LDS. Why is this? 
+ +Recall our definition of this metric: + + Indicates the maximum amount of bytes that could have been loaded + from/stored to/atomically updated in the LDS per + :ref:`normalization unit `. + +Here we see that this instruction *could* have loaded up to 256 bytes of +data (4 bytes for each work-item in the wavefront), and therefore this +is the expected value for this metric in Omniperf, hence why this metric +is named the “theoretical” bandwidth. + +To further illustrate this point we plot the relationship of the +theoretical bandwidth metric (**12.2.1**) as compared to the effective (or +achieved) bandwidth of this kernel, varying the number of work-items +launched from 1 to 256: + +.. figure:: ../data/profiling-by-example/ldsbandwidth.png + :align: center + :alt: Comparison of effective bandwidth versus the theoretical bandwidth + metric in Omniperf for our simple example. + :width: 800 + + Comparison of effective bandwidth versus the theoretical bandwidth + metric in Omniperf for our simple example. + +Here we see that the theoretical bandwidth metric follows a step-function. It +increases only when another wavefront issues an LDS instruction for up to 256 +bytes of data. Such increases are marked in the plot using dashed lines. In +contrast, the effective bandwidth increases linearly, by 4 bytes, with the +number of work-items in the kernel, N. + +.. _lds-bank-conflicts: + +Bank conflicts +-------------- + +Next we explore bank conflicts using a slight modification of our bandwidth +kernel: + +.. 
code-block:: cpp + + constexpr unsigned nbanks = 32; + __global__ void conflicts(int* out, int flag) { + constexpr unsigned nelements = nbanks * max_threads; + __shared__ int array[nelements]; + // each thread reads from the same bank + int index = threadIdx.x * nbanks; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; + } + +Here we: + +* Allocate an :doc:`LDS ` array of size + :math:`32*256*4{B}=32{KiB}` + +* Fake a write to the LDS using the ``flag`` + variable (always set to zero on the host) to avoid dead-code elimination + +* Read a single integer per work-item from index + ``threadIdx.x * nbanks`` of the LDS array + +* If the integer is equal to a + magic number (always false), write the value out to global memory to, + again, avoid dead-code elimination. + +On the host, we again repeatedly launch this kernel, varying the number +of work-items: + +.. code-block:: cpp + + void conflicts_demo(int N) { + for (int i = 1; i <= N; ++i) + conflicts<<<1,i>>>(nullptr, 0); + hipDeviceSynchronize(); + } + +Analyzing our first ``conflicts`` kernel (i.e., a single work-item), we +see: + +.. code-block:: shell + + $ omniperf analyze -p workloads/lds/mi200/ -b 12.2.4 12.2.6 --dispatch 256 -n per_kernel + <...> + -------------------------------------------------------------------------------- + 12. 
Local Data Share (LDS) + 12.2 LDS Stats + ╒═════════╤════════════════╤═══════╤═══════╤═══════╤═══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════╪═══════╪═══════╪═══════╪═══════════════════╡ + │ 12.2.4 │ Index Accesses │ 2.00 │ 2.00 │ 2.00 │ Cycles per kernel │ + ├─────────┼────────────────┼───────┼───────┼───────┼───────────────────┤ + │ 12.2.6 │ Bank Conflict │ 0.00 │ 0.00 │ 0.00 │ Cycles per kernel │ + ╘═════════╧════════════════╧═══════╧═══════╧═══════╧═══════════════════╛ + +In our :ref:`previous example `, we showed how a load +from a single work-item is considered to have a theoretical bandwidth of +256B. Recall, the :doc:`LDS ` can load up to :math:`128B` per +cycle (i.e, 32 banks x 4B / bank / cycle). Hence, we see that loading an 4B +integer spends two cycles accessing the LDS +(:math:`2\ {cycle} = (256B) / (128\ B/{cycle})`). + +Looking at the next ``conflicts`` dispatch (i.e., two work-items) yields: + +.. code-block:: shell + + $ omniperf analyze -p workloads/lds/mi200/ -b 12.2.4 12.2.6 --dispatch 257 -n per_kernel + <...> + -------------------------------------------------------------------------------- + 12. Local Data Share (LDS) + 12.2 LDS Stats + ╒═════════╤════════════════╤═══════╤═══════╤═══════╤═══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════╪═══════╪═══════╪═══════╪═══════════════════╡ + │ 12.2.4 │ Index Accesses │ 3.00 │ 3.00 │ 3.00 │ Cycles per kernel │ + ├─────────┼────────────────┼───────┼───────┼───────┼───────────────────┤ + │ 12.2.6 │ Bank Conflict │ 1.00 │ 1.00 │ 1.00 │ Cycles per kernel │ + ╘═════════╧════════════════╧═══════╧═══════╧═══════╧═══════════════════╛ + +Here we see a bank conflict! What happened? + +Recall that the index for each thread was calculated as: + +.. code-block:: cpp + + int index = threadIdx.x * nbanks; + +Or, precisely 32 elements, and each element is 4B wide (for a standard +integer). 
That is, each thread strides back to the same bank in the LDS, +such that each work-item we add to the dispatch results in another bank +conflict! + +Recalling our discussion of bank conflicts in our +:doc:`LDS ` description: + +A bank conflict occurs when two (or more) work-items in a wavefront +want to read, write, or atomically update different addresses that +map to the same bank in the same cycle. In this case, the conflict +detection hardware will determine a new schedule such that the +access is split into multiple cycles with no conflicts in any +single cycle. + +Here we see the conflict resolution hardware in action! Because we have +engineered our kernel to generate conflicts, we expect our bank conflict +metric to scale linearly with the number of work-items: + +.. figure:: ../data/profiling-by-example/ldsconflicts.png + :align: center + :alt: Comparison of LDS conflict cycles versus access cycles for our simple + example. + :width: 800 + + Comparison of LDS conflict cycles versus access cycles for our simple + example. + +Here we show the comparison of the Index Accesses (**12.2.4**), to the Bank +Conflicts (**12.2.6**) for the first 20 kernel invocations. We see that each grows +linearly, and there is a constant gap of 2 cycles between them (i.e., the first +access is never considered a conflict). + +Finally, we can use these two metrics to derive the Bank Conflict Rate (**12.1.4**). +Since within an Index Access we have 32 banks that may need to be updated, we +use: + +.. math:: + + Bank\ Conflict\ Rate = 100 * ((Bank\ Conflicts / 32) / (Index\ Accesses - Bank\ Conflicts)) + +Plotting this, we see: + +.. figure:: ../data/profiling-by-example/ldsconflictrate.png + :align: center + :alt: LDS bank conflict rate example + :width: 800 + + LDS Bank Conflict rate for our simple example. + +The bank conflict rate linearly increases with the number of work-items +within a wavefront that are active, *approaching* 100%, but never quite +reaching it. 
diff --git a/projects/rocprofiler-compute/docs/tutorial/includes/occupancy-limiters-example.rst b/projects/rocprofiler-compute/docs/tutorial/includes/occupancy-limiters-example.rst new file mode 100644 index 0000000000..3242545338 --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/includes/occupancy-limiters-example.rst @@ -0,0 +1,456 @@ +.. _occupancy-example: + +Occupancy limiters example +========================== + +For this example, consider the +:dev-sample:`occupancy ` included with Omniperf. We will +investigate the use of the resource allocation panel in the +:ref:`Workgroup Manager `’s metrics section to determine occupancy +limiters. This code contains several kernels to explore both how various +kernel resources impact achieved occupancy and how this is reported in +Omniperf. + +This example was compiled and run on an MI250 accelerator using ROCm +v5.6.0, and Omniperf v2.0.0: + +.. code-block:: shell + + $ hipcc -O3 occupancy.hip -o occupancy --save-temps + +We have again included the ``--save-temps`` flag to get the +corresponding assembly. + +Finally, we generate our Omniperf profile as: + +.. code-block:: shell + + $ omniperf profile -n occupancy --no-roof -- ./occupancy + +.. _occupancy-experiment-design: + +Design note +----------- + +For our occupancy test, we need to create a kernel that is resource +heavy, in various ways. For this purpose, we use the following (somewhat +funny-looking) kernel: + +.. 
code-block:: cpp + + constexpr int bound = 16; + __launch_bounds__(256) + __global__ void vgprbound(int N, double* ptr) { + double intermediates[bound]; + for (int i = 0 ; i < bound; ++i) intermediates[i] = N * threadIdx.x; + double x = ptr[threadIdx.x]; + for (int i = 0; i < 100; ++i) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % bound], intermediates[i % bound])); + intermediates[i % bound] = x; + } + if (x == N) ptr[threadIdx.x] = x; + } + +Here we try to use as many :ref:`VGPRs ` as possible. To this end: + +* We create a small array of double precision floats, that we size to try + to fit into registers (i.e., ``bound``, this may need to be tuned + depending on the ROCm version). + +* We specify ``__launch_bounds__(256)`` + to increase the number of VGPRs available to the kernel (by limiting the + number of wavefronts that can be resident on a + :doc:`CU `). + +* Write a unique non-compile time constant to each element of the array. + +* Repeatedly permute and call relatively expensive math functions on our + array elements. + +* Keep the compiler from optimizing out any operations by faking a write to the + ``ptr`` based on a run-time conditional. + +This yields a total of 122 VGPRs, but it is expected this number will +depend on the exact ROCm/compiler version. + +.. code-block:: asm + + .size _Z9vgprboundiPd, .Lfunc_end1-_Z9vgprboundiPd + ; -- End function + .section .AMDGPU.csdata + ; Kernel info: + ; codeLenInByte = 4732 + ; NumSgprs: 68 + ; NumVgprs: 122 + ; NumAgprs: 0 + ; <...> + ; AccumOffset: 124 + +We will use various permutations of this kernel to limit occupancy, and +more importantly for the purposes of this example, demonstrate how this +is reported in Omniperf. + +.. _vgpr-occupancy: + +VGPR limited +------------ + +For our first test, we use the ``vgprbound`` kernel discussed in the +:ref:`design note `. After profiling, we run +the analyze step on this kernel: + +.. 
code-block:: shell + + $ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 --dispatch 1 + <...> + -------------------------------------------------------------------------------- + 0. Top Stat + ╒════╤═════════════════════════╤═════════╤══════════════╤══════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪═════════════════════════╪═════════╪══════════════╪══════════════╪══════════════╪════════╡ + │ 0 │ vgprbound(int, double*) │ 1.00 │ 923093822.50 │ 923093822.50 │ 923093822.50 │ 100.00 │ + ╘════╧═════════════════════════╧═════════╧══════════════╧══════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 2. System Speed-of-Light + 2.1 Speed-of-Light + ╒═════════╤═════════════════════╤═════════╤════════════╤═════════╤═══════════════╕ + │ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ + ╞═════════╪═════════════════════╪═════════╪════════════╪═════════╪═══════════════╡ + │ 2.1.15 │ Wavefront Occupancy │ 1661.24 │ Wavefronts │ 3328.00 │ 49.92 │ + ╘═════════╧═════════════════════╧═════════╧════════════╧═════════╧═══════════════╛ + + + -------------------------------------------------------------------------------- + 6. 
Workgroup Manager (SPI) + 6.2 Workgroup Manager - Resource Allocation + ╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡ + │ 6.2.0 │ Not-scheduled Rate (Workgroup Manager) │ 0.64 │ 0.64 │ 0.64 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.1 │ Not-scheduled Rate (Scheduler-Pipe) │ 24.94 │ 24.94 │ 24.94 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.2 │ Scheduler-Pipe Stall Rate │ 24.49 │ 24.49 │ 24.49 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.3 │ Scratch Stall Rate │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.4 │ Insufficient SIMD Waveslots │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.5 │ Insufficient SIMD VGPRs │ 94.90 │ 94.90 │ 94.90 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.6 │ Insufficient SIMD SGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.7 │ Insufficient CU LDS │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.8 │ Insufficient CU Barriers │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.9 │ Reached CU Workgroup Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.10 │ Reached CU Wavefront Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ + 
╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛ + + + -------------------------------------------------------------------------------- + 7. Wavefront + 7.1 Wavefront Launch Stats + ╒═════════╤══════════╤════════╤════════╤════════╤═══════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪══════════╪════════╪════════╪════════╪═══════════╡ + │ 7.1.5 │ VGPRs │ 124.00 │ 124.00 │ 124.00 │ Registers │ + ├─────────┼──────────┼────────┼────────┼────────┼───────────┤ + │ 7.1.6 │ AGPRs │ 4.00 │ 4.00 │ 4.00 │ Registers │ + ├─────────┼──────────┼────────┼────────┼────────┼───────────┤ + │ 7.1.7 │ SGPRs │ 80.00 │ 80.00 │ 80.00 │ Registers │ + ╘═════════╧══════════╧════════╧════════╧════════╧═══════════╛ + +Here we see that the kernel indeed does use *around* (but not exactly) +122 VGPRs, with the difference due to granularity of VGPR allocations. +In addition, we see that we have allocated 4 “:ref:`AGPRs `”. We +note that on current CDNA2 accelerators, the ``AccumOffset`` field of +the assembly metadata: + +.. code-block:: asm + + ; AccumOffset: 124 + +denotes the divide between ``VGPRs`` and ``AGPRs``. + +Next, we examine our wavefront occupancy (**2.1.15**), and see that we are +reaching only :math:`\sim50\%` of peak occupancy. As a result, we see +that: + +- We are not scheduling workgroups :math:`\sim25\%` of + :ref:`total scheduler-pipe cycles ` (**6.2.1**); recall + from the discussion of the `workgroup manager `, 25% is the maximum. + +- The scheduler-pipe is stalled (**6.2.2**) from scheduling workgroups due to + resource constraints for the same :math:`\sim25\%` of the time. + +- And finally, :math:`\sim95\%` of those stalls are due to a lack of SIMDs + with the appropriate number of VGPRs available (**6.2.5**). + +That is, the reason we can’t reach full occupancy is due to our VGPR +usage, as expected! + +LDS limited +----------- + +To examine an LDS limited example, we must change our kernel slightly: + +.. 
code-block:: cpp + + constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double); + __launch_bounds__(256) + __global__ void ldsbound(int N, double* ptr) { + __shared__ double intermediates[fully_allocate_lds]; + for (int i = threadIdx.x ; i < fully_allocate_lds; i += blockDim.x) intermediates[i] = N * threadIdx.x; + __syncthreads(); + double x = ptr[threadIdx.x]; + for (int i = threadIdx.x; i < fully_allocate_lds; i += blockDim.x) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % fully_allocate_lds], intermediates[i % fully_allocate_lds])); + __syncthreads(); + intermediates[i % fully_allocate_lds] = x; + } + if (x == N) ptr[threadIdx.x] = x; + } + +Where we now: + +* Allocate a 64 KiB LDS array per workgroup, and + +* Use our allocated LDS array instead of a register array + +Analyzing this: + +.. code-block:: shell + + $ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 7.1.8 --dispatch 3 + <...> + -------------------------------------------------------------------------------- + 2. System Speed-of-Light + 2.1 Speed-of-Light + ╒═════════╤═════════════════════╤════════╤════════════╤═════════╤═══════════════╕ + │ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ + ╞═════════╪═════════════════════╪════════╪════════════╪═════════╪═══════════════╡ + │ 2.1.15 │ Wavefront Occupancy │ 415.52 │ Wavefronts │ 3328.00 │ 12.49 │ + ╘═════════╧═════════════════════╧════════╧════════════╧═════════╧═══════════════╛ + + + -------------------------------------------------------------------------------- + 6. 
Workgroup Manager (SPI) + 6.2 Workgroup Manager - Resource Allocation + ╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡ + │ 6.2.0 │ Not-scheduled Rate (Workgroup Manager) │ 0.13 │ 0.13 │ 0.13 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.1 │ Not-scheduled Rate (Scheduler-Pipe) │ 24.87 │ 24.87 │ 24.87 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.2 │ Scheduler-Pipe Stall Rate │ 24.84 │ 24.84 │ 24.84 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.3 │ Scratch Stall Rate │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.4 │ Insufficient SIMD Waveslots │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.5 │ Insufficient SIMD VGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.6 │ Insufficient SIMD SGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.7 │ Insufficient CU LDS │ 96.47 │ 96.47 │ 96.47 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.8 │ Insufficient CU Barriers │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.9 │ Reached CU Workgroup Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.10 │ Reached CU Wavefront Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ + 
╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛ + + + -------------------------------------------------------------------------------- + 7. Wavefront + 7.1 Wavefront Launch Stats + ╒═════════╤════════════════╤══════════╤══════════╤══════════╤═══════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════╪══════════╪══════════╪══════════╪═══════════╡ + │ 7.1.5 │ VGPRs │ 96.00 │ 96.00 │ 96.00 │ Registers │ + ├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤ + │ 7.1.6 │ AGPRs │ 0.00 │ 0.00 │ 0.00 │ Registers │ + ├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤ + │ 7.1.7 │ SGPRs │ 80.00 │ 80.00 │ 80.00 │ Registers │ + ├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤ + │ 7.1.8 │ LDS Allocation │ 65536.00 │ 65536.00 │ 65536.00 │ Bytes │ + ╘═════════╧════════════════╧══════════╧══════════╧══════════╧═══════════╛ + +We see that our VGPR allocation has gone down to 96 registers, but now +we see our 64KiB LDS allocation (**7.1.8**). In addition, we see a similar +not-scheduled rate (**6.2.1**) and stall rate (**6.2.2**) as in our +:ref:`VGPR example `. However, our occupancy limiter has now +shifted from VGPRs (**6.2.5**) to LDS (**6.2.7**). + +We note that although we see around the same scheduler/stall rates +(with our LDS limiter), our wave occupancy (**2.1.15**) is significantly +lower (:math:`\sim12\%`)! This is important to remember: the occupancy +limiter metrics in the resource allocation section tell you what the +limiter was, but *not* how much the occupancy was limited. These metrics +should always be analyzed in concert with the wavefront occupancy +metric! + +.. _sgpr-occupancy: + +SGPR limited +------------ + +Finally, we modify our kernel once more to make it limited by +`SGPRs `__: + +.. 
code-block:: cpp + + constexpr int sgprlim = 1; + __launch_bounds__(1024, 8) + __global__ void sgprbound(int N, double* ptr) { + double intermediates[sgprlim]; + for (int i = 0 ; i < sgprlim; ++i) intermediates[i] = i; + double x = ptr[0]; + #pragma unroll 1 + for (int i = 0; i < 100; ++i) { + x += sin(pow(intermediates[(i - 1) % sgprlim], intermediates[i % sgprlim])); + intermediates[i % sgprlim] = x; + } + if (x == N) ptr[0] = x; + } + +The major changes here are to: + +* Make as much as possible provably uniform across the wave (notice the lack + of ``threadIdx.x`` in the ``intermediates`` initialization and elsewhere), + +* Add ``__launch_bounds__(1024, 8)``, which reduces our maximum VGPRs to 64 + (such that 8 waves can fit per SIMD), but causes some register spills + (i.e., :ref:`scratch ` usage), and + +* Lower the ``bound`` (here we use ``sgprlim``) of the array to reduce + VGPR/Scratch usage. + +This results in the following assembly metadata for this kernel: + +.. code-block:: asm + + .size _Z9sgprboundiPd, .Lfunc_end3-_Z9sgprboundiPd + ; -- End function + .section .AMDGPU.csdata + ; Kernel info: + ; codeLenInByte = 4872 + ; NumSgprs: 76 + ; NumVgprs: 64 + ; NumAgprs: 0 + ; TotalNumVgprs: 64 + ; ScratchSize: 60 + ; <...> + ; AccumOffset: 64 + ; Occupancy: 8 + +Analyzing this workload yields: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 7.1.8 7.1.9 --dispatch 5 + <...> + -------------------------------------------------------------------------------- + 0. 
Top Stat + ╒════╤═════════════════════════╤═════════╤══════════════╤══════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪═════════════════════════╪═════════╪══════════════╪══════════════╪══════════════╪════════╡ + │ 0 │ sgprbound(int, double*) │ 1.00 │ 782069812.00 │ 782069812.00 │ 782069812.00 │ 100.00 │ + ╘════╧═════════════════════════╧═════════╧══════════════╧══════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 2. System Speed-of-Light + 2.1 Speed-of-Light + ╒═════════╤═════════════════════╤═════════╤════════════╤═════════╤═══════════════╕ + │ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ + ╞═════════╪═════════════════════╪═════════╪════════════╪═════════╪═══════════════╡ + │ 2.1.15 │ Wavefront Occupancy │ 3291.76 │ Wavefronts │ 3328.00 │ 98.91 │ + ╘═════════╧═════════════════════╧═════════╧════════════╧═════════╧═══════════════╛ + + + -------------------------------------------------------------------------------- + 6. 
Workgroup Manager (SPI) + 6.2 Workgroup Manager - Resource Allocation + ╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡ + │ 6.2.0 │ Not-scheduled Rate (Workgroup Manager) │ 7.72 │ 7.72 │ 7.72 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.1 │ Not-scheduled Rate (Scheduler-Pipe) │ 15.17 │ 15.17 │ 15.17 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.2 │ Scheduler-Pipe Stall Rate │ 7.38 │ 7.38 │ 7.38 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.3 │ Scratch Stall Rate │ 39.76 │ 39.76 │ 39.76 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.4 │ Insufficient SIMD Waveslots │ 26.32 │ 26.32 │ 26.32 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.5 │ Insufficient SIMD VGPRs │ 26.32 │ 26.32 │ 26.32 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.6 │ Insufficient SIMD SGPRs │ 25.52 │ 25.52 │ 25.52 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.7 │ Insufficient CU LDS │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.8 │ Insufficient CU Barriers │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.9 │ Reached CU Workgroup Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ + ├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ + │ 6.2.10 │ Reached CU Wavefront Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ + 
╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛ + + + -------------------------------------------------------------------------------- + 7. Wavefront + 7.1 Wavefront Launch Stats + ╒═════════╤════════════════════╤═══════╤═══════╤═══════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════════════╪═══════╪═══════╪═══════╪════════════════╡ + │ 7.1.5 │ VGPRs │ 64.00 │ 64.00 │ 64.00 │ Registers │ + ├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤ + │ 7.1.6 │ AGPRs │ 0.00 │ 0.00 │ 0.00 │ Registers │ + ├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤ + │ 7.1.7 │ SGPRs │ 80.00 │ 80.00 │ 80.00 │ Registers │ + ├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤ + │ 7.1.8 │ LDS Allocation │ 0.00 │ 0.00 │ 0.00 │ Bytes │ + ├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤ + │ 7.1.9 │ Scratch Allocation │ 60.00 │ 60.00 │ 60.00 │ Bytes/workitem │ + ╘═════════╧════════════════════╧═══════╧═══════╧═══════╧════════════════╛ + +Here we see that our wavefront launch stats (**7.1**) have changed to +reflect the metadata seen in the ``--save-temps`` output. Of particular +interest, we see: + +* The SGPR allocation (**7.1.7**) is 80 registers, slightly more than the 76 + requested by the compiler due to allocation granularity, and + +* We have a :ref:`"scratch" `, that is, private memory, + allocation of 60 bytes per work-item. + +Analyzing the resource allocation block (**6.2**) we now see that for the +first time, the "Not-scheduled Rate (Workgroup Manager)" metric (**6.2.0**) +has become non-zero. This is because the workgroup manager is +responsible for management of scratch, which we see also contributes to +our occupancy limiters in the "Scratch Stall Rate" (**6.2.3**). 
Note that +the sum of the workgroup manager not-scheduled rate and the +scheduler-pipe not-scheduled rate is still :math:`\sim25\%`, as in our +previous examples. + +Next, we see that the scheduler-pipe stall rate (**6.2.2**), that is, how often +we could not schedule a workgroup to a CU, was only +:math:`\sim8\%`. This hints that perhaps, our kernel is not +*particularly* occupancy limited by resources. Indeed, checking the +wave occupancy metric (**2.1.15**) shows that this kernel is reaching nearly +99% occupancy. + +Finally, we inspect the occupancy limiter metrics and see a roughly even +split between :ref:`waveslots ` (**6.2.4**), :ref:`VGPRs ` +(**6.2.5**), and :ref:`SGPRs ` (**6.2.6**) along with the scratch stalls +(**6.2.3**) previously mentioned. + +This is yet another reminder to view occupancy holistically. While these +metrics tell you why a workgroup cannot be scheduled, they do *not* tell +you what your occupancy was (consult wavefront occupancy) *nor* whether +increasing occupancy will be beneficial to performance. diff --git a/projects/rocprofiler-compute/docs/tutorial/includes/valu-arithmetic-instruction-mix.rst b/projects/rocprofiler-compute/docs/tutorial/includes/valu-arithmetic-instruction-mix.rst new file mode 100644 index 0000000000..785fc6ecf9 --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/includes/valu-arithmetic-instruction-mix.rst @@ -0,0 +1,113 @@ +.. _valu-arith-instruction-mix-ex: + +VALU arithmetic instruction mix +=============================== + + For this example, consider the + :dev-sample:`instruction mix sample ` distributed as a part + of Omniperf. + +.. note:: + + The examples in this section are expected to work on all CDNA™ accelerators. + However, the actual experiment results in this section were collected on an + :ref:`MI2XX ` accelerator. + +.. 
_valu-experiment-design: + +Design note +----------- + +This code uses a number of inline assembly instructions to cleanly +identify the types of instructions being issued, as well as to avoid +optimization / dead-code elimination by the compiler. While inline +assembly is inherently not portable, this example is expected to work on +all GCN™ GPUs and CDNA accelerators. + +We reproduce a sample of the kernel as follows: + +.. code-block:: cpp + + // fp32: add, mul, transcendental and fma + float f1, f2; + asm volatile( + "v_add_f32_e32 %0, %1, %0\n" + "v_mul_f32_e32 %0, %1, %0\n" + "v_sqrt_f32 %0, %1\n" + "v_fma_f32 %0, %1, %0, %1\n" + : "=v"(f1) + : "v"(f2)); + +These instructions correspond to: + +* A 32-bit floating point addition, + +* a 32-bit floating point multiplication, + +* a 32-bit floating point square-root transcendental operation, and + +* a 32-bit floating point fused multiply-add operation. + +For more detail, refer to the `CDNA2 ISA +Guide `__. + +Instruction mix +^^^^^^^^^^^^^^^ + + This example was compiled and run on a MI250 accelerator using ROCm + v5.6.0, and Omniperf v2.0.0. + +.. code-block:: shell + + $ hipcc -O3 instmix.hip -o instmix + +Generate the profile for this example using the following command. + +.. code-block:: shell + + $ omniperf profile -n instmix --no-roof -- ./instmix + +Analyze the instruction mix section. + +.. code-block:: shell + + $ omniperf analyze -p workloads/instmix/mi200/ -b 10.2 + <...> + 10. 
Compute Units - Instruction Mix + 10.2 VALU Arithmetic Instr Mix + ╒═════════╤════════════╤═════════╤════════════════╕ + │ Index │ Metric │ Count │ Unit │ + ╞═════════╪════════════╪═════════╪════════════════╡ + │ 10.2.0 │ INT32 │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.1 │ INT64 │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.2 │ F16-ADD │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.3 │ F16-MUL │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.4 │ F16-FMA │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.5 │ F16-Trans │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.6 │ F32-ADD │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.7 │ F32-MUL │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.8 │ F32-FMA │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.9 │ F32-Trans │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.10 │ F64-ADD │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.11 │ F64-MUL │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.12 │ F64-FMA │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.13 │ F64-Trans │ 1.00 │ Instr per wave │ + ├─────────┼────────────┼─────────┼────────────────┤ + │ 10.2.14 │ Conversion │ 1.00 │ Instr per wave │ + ╘═════════╧════════════╧═════════╧════════════════╛ + +This shows that we have exactly one of each type of VALU arithmetic instruction +by construction. 
diff --git a/projects/rocprofiler-compute/docs/tutorial/includes/vector-memory-operation-counting.rst b/projects/rocprofiler-compute/docs/tutorial/includes/vector-memory-operation-counting.rst new file mode 100644 index 0000000000..e3dd0deb4a --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/includes/vector-memory-operation-counting.rst @@ -0,0 +1,698 @@ +.. _vmem-example: + +Vector memory operation counting +================================ + +.. _flat-memory-ex: + +Global / Generic (FLAT) +----------------------- + +For this example, consider the +:dev-sample:`vector memory sample ` distributed as a part of +Omniperf. This code launches many different versions of simple +read/write/atomic-only kernels targeting various address spaces. For example, +below is our simple ``global_write`` kernel: + +.. code-block:: cpp + + // write to a global pointer + __global__ void global_write(int* ptr, int zero) { + ptr[threadIdx.x] = zero; + } + +.. note:: + + This example was compiled and run on an MI250 accelerator using ROCm + v5.6.0, and Omniperf v2.0.0. + +.. code-block:: shell-session + + $ hipcc -O3 --save-temps vmem.hip -o vmem + +We have also chosen to include the ``--save-temps`` flag to save the +compiler temporary files, such as the generated CDNA assembly code, for +inspection. + +Finally, we generate our ``omniperf profile`` as follows. + +.. code-block:: shell-session + + $ omniperf profile -n vmem --no-roof -- ./vmem + +.. _flat-experiment-design: + +Design note +^^^^^^^^^^^ + +This section explains some of the more peculiar lines of code in the +example, such as the use of compiler built-ins and explicit address space +casting. + +.. 
code-block:: cpp + + // write to a generic pointer + typedef int __attribute__((address_space(0)))* generic_ptr; + + __attribute__((noinline)) __device__ void generic_store(generic_ptr ptr, int zero) { *ptr = zero; } + + __global__ void generic_write(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x < filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_store((generic_ptr)generic, zero); + } + +One of the aims of this example is to demonstrate the use of the +:llvm-docs:`"generic" FLAT ` address space. This +address space is typically used when the compiler cannot statically prove where +the backing memory is located. + +To try to *force* the compiler to use this address space, we applied +``__attribute__((noinline))`` to the ``generic_store`` function to have the +compiler treat it as a function call (that is, on the other side of which, the +address space may not be known). However, in a trivial example such as this, the +compiler may choose to specialize the ``generic_store`` function to the two +address spaces that might provably be used from our translation unit, that is, +:ref:`"local" (or, LDS) ` and :ref:`"global" `. +Hence, we forcibly cast the address space to +:ref:`"generic" (or, FLAT) ` to avoid this compiler +optimization. + +.. warning:: + + While convenient for this example, this sort of explicit address space + casting can lead to strange compilation errors, and in the worst case, + incorrect results. As a result, use is discouraged in production code. + +For more details on address spaces, refer to +:ref:`memory-spaces`. + +Global write +^^^^^^^^^^^^ + +First, we demonstrate our simple ``global_write`` kernel: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/vmem/mi200/ --dispatch 1 -b 10.3 15.1.4 15.1.5 15.1.6 15.1.7 15.1.8 15.1.9 15.1.10 15.1.11 -n per_kernel + <...> + -------------------------------------------------------------------------------- + 0. 
Top Stat + ╒════╤═════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪═════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ global_write(int*, int) [clone .kd] │ 1.00 │ 2400.00 │ 2400.00 │ 2400.00 │ 100.00 │ + ╘════╧═════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 10. Compute Units - Instruction Mix + 10.3 VMEM Instr Mix + ╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.2 │ Global/Generic Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per 
kernel │ + ╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + + -------------------------------------------------------------------------------- + 15. Address Processing Unit and Data Return Path (TA/TD) + 15.1 Address Processing Unit + ╒═════════╤═════════════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 15.1.4 │ Total Instructions │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 15.1.5 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 15.1.6 │ Global/Generic Read Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 15.1.7 │ Global/Generic Write Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 15.1.8 │ Global/Generic Atomic Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 15.1.9 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 15.1.10 │ Spill/Stack Read Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 15.1.11 │ Spill/Stack Write Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ╘═════════╧═════════════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + +Here, we have presented both the information in the VMEM Instruction Mix +table (**10.3**) and the Address Processing Unit (**15.1**). 
We note that this +data is expected to be identical, and hence we omit table 15.1 in our +subsequent examples. + +In addition, as expected, we see a single Global/Generic Write +instruction (**10.3.2**, **15.1.7**). Inspecting the generated assembly, we get: + +.. code-block:: asm + + .protected _Z12global_writePii ; -- Begin function _Z12global_writePii + .globl _Z12global_writePii + .p2align 8 + .type _Z12global_writePii,@function + _Z12global_writePii: ; @_Z12global_writePii + ; %bb.0: + s_load_dword s2, s[4:5], 0x8 + s_load_dwordx2 s[0:1], s[4:5], 0x0 + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s2 + global_store_dword v0, v1, s[0:1] + s_endpgm + .section .rodata,#alloc + .p2align 6, 0x0 + .amdhsa_kernel _Z12global_writePii + +Notice that this corresponds to an instance of a ``global_store_dword`` +operation. + +.. note:: + + The assembly in these experiments was generated for an + :ref:`MI2XX ` accelerator using ROCm 5.6.0, and may change + depending on ROCm versions and the targeted hardware architecture. + +.. _generic-write-ex: + +Generic write to LDS +^^^^^^^^^^^^^^^^^^^^ + +Next, we examine a generic write. As discussed +:ref:`previously `, our ``generic_write`` kernel uses an +address space cast to *force* the compiler to choose our desired address +space, regardless of other optimizations that may be possible. + +Also note that the ``filter`` parameter passed in as a kernel argument (see +:dev-sample:`example ` and +:ref:`design note `) is set to zero on the host, such +that we always write to the :doc:`local ` (LDS) +memory allocation ``lds``. + +Examining this kernel in the VMEM Instruction Mix table yields: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/vmem/mi200/ --dispatch 2 -b 10.3 -n per_kernel + <...> + 0. 
Top Stat + ╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ generic_write(int*, int, int) [clone .kd │ 1.00 │ 2880.00 │ 2880.00 │ 2880.00 │ 100.00 │ + │ │ ] │ │ │ │ │ │ + ╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 10. Compute Units - Instruction Mix + 10.3 VMEM Instr Mix + ╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.2 │ Global/Generic Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.7 │ Spill/Stack 
Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + +As expected we see a single generic write (**10.3.2**). In the assembly +generated for this kernel (in particular, we care about the +``generic_store`` function), we see that this corresponds to a +``flat_store_dword`` instruction: + +.. code-block:: asm + + .type _Z13generic_storePii,@function + _Z13generic_storePii: ; @_Z13generic_storePii + ; %bb.0: + s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + flat_store_dword v[0:1], v2 + s_waitcnt vmcnt(0) lgkmcnt(0) + s_setpc_b64 s[30:31] + .Lfunc_end0: + +In addition, we note that we can observe the destination of this request +by looking at the LDS Instructions metric (**12.2.0**) -- which indicates one LDS +access. + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/vmem/mi200/ --dispatch 2 -b 12.2.0 -n per_kernel + <...> + 12. Local Data Share (LDS) + 12.2 LDS Stats + ╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 12.2.0 │ LDS Instrs │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ + +.. note:: + + Exercise for the reader: if this access had been targeted at global memory + (for instance, by changing value of ``filter``), where should we look for the + memory traffic? Hint: see the :ref:`generic read ` example. + +.. _global-read-ex: + +Global read +^^^^^^^^^^^ + +Next, we examine a simple global read operation: + +.. code-block:: cpp + + __global__ void global_read(int* ptr, int zero) { + int x = ptr[threadIdx.x]; + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } + } + +Here we observe a now familiar pattern: + +- Read a value in from global memory. 
+ +- Have a write hidden behind a conditional that is impossible for + the compiler to statically eliminate, but is identically false. In this + case, our ``main()`` function initializes the data in ``ptr`` to zero. + +Running Omniperf on this kernel yields: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/vmem/mi200/ --dispatch 3 -b 10.3 -n per_kernel + <...> + 0. Top Stat + ╒════╤════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ global_read(int*, int) [clone .kd] │ 1.00 │ 4480.00 │ 4480.00 │ 4480.00 │ 100.00 │ + ╘════╧════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 10. Compute Units - Instruction Mix + 10.3 VMEM Instr Mix + ╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.1 │ Global/Generic Read │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + 
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + +Here we see a single global/generic instruction (**10.3.0**) which, as +expected, is a read (**10.3.1**). + +.. _generic-read-ex: + +Generic read from global memory +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For our generic read example, we choose to change our target for the +generic read to be global memory: + +.. code-block:: cpp + + __global__ void generic_read(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + if (static_cast<int>(filter - 1) == zero) { + lds[threadIdx.x] = 0; // initialize to zero to avoid conditional, but hide behind _another_ conditional + } + int* generic; + if (static_cast<int>(threadIdx.x) > filter - 1) { + generic = &ptr[threadIdx.x]; + } else { + generic = &lds[threadIdx.x]; + abort(); + } + int x = generic_load((generic_ptr)generic); + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } + } + +In addition to our usual ``if (condition_that_wont_happen)`` guard +around the write operation, there is an additional conditional around +the initialization of the ``lds`` buffer. We note that it’s typically +required to write to this buffer to prevent the compiler from +eliminating the local memory branch entirely due to undefined behavior +(use of an uninitialized value). However, to report *only* our global +memory read, we again hide this initialization behind an identically +false conditional (both ``zero`` and ``filter`` are set to zero in the +kernel launch). 
Note that this is a *different* conditional from our +pointer assignment (to avoid combination of the two). + +Running Omniperf on this kernel reports: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/vmem/mi200/ --dispatch 4 -b 10.3 12.2.0 16.3.10 -n per_kernel + <...> + 0. Top Stat + ╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ generic_read(int*, int, int) [clone .kd] │ 1.00 │ 2240.00 │ 2240.00 │ 2240.00 │ 100.00 │ + ╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 10. Compute Units - Instruction Mix + 10.3 VMEM Instr Mix + ╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.1 │ Global/Generic Read │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr 
per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + + -------------------------------------------------------------------------------- + 12. Local Data Share (LDS) + 12.2 LDS Stats + ╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 12.2.0 │ LDS Instrs │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + + -------------------------------------------------------------------------------- + 16. Vector L1 Data Cache + 16.3 L1D Cache Accesses + ╒═════════╤════════════╤═══════╤═══════╤═══════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════╪═══════╪═══════╪═══════╪════════════════╡ + │ 16.3.10 │ L1-L2 Read │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ + ╘═════════╧════════════╧═══════╧═══════╧═══════╧════════════════╛ + +Here we observe: + +- A single global/generic read operation (**10.3.1**), which + +- Is not an LDS instruction (**12.2**), as seen in the + :ref:`generic write ` example, but is instead + +- An L1-L2 read operation (**16.3.10**) + +That is, we have successfully targeted our generic read at global +memory. Inspecting the assembly shows this corresponds to a +``flat_load_dword`` instruction. + +.. _global-atomic-ex: + +Global atomic +^^^^^^^^^^^^^ + +Our global atomic kernel simply atomically adds a (non-compile-time) zero value +to a pointer. + +.. 
code-block:: cpp + + __global__ void global_atomic(int* ptr, int zero) { + atomicAdd(ptr, zero); + } + + +Running Omniperf on this kernel yields: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/vmem/mi200/ --dispatch 5 -b 10.3 16.3.12 -n per_kernel + <...> + 0. Top Stat + ╒════╤══════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪══════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ global_atomic(int*, int) [clone .kd] │ 1.00 │ 4640.00 │ 4640.00 │ 4640.00 │ 100.00 │ + ╘════╧══════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + + -------------------------------------------------------------------------------- + 10. Compute Units - Instruction Mix + 10.3 VMEM Instr Mix + ╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.3 │ Global/Generic Atomic │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + 
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + + -------------------------------------------------------------------------------- + 16. Vector L1 Data Cache + 16.3 L1D Cache Accesses + ╒═════════╤══════════════╤═══════╤═══════╤═══════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪══════════════╪═══════╪═══════╪═══════╪════════════════╡ + │ 16.3.12 │ L1-L2 Atomic │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ + ╘═════════╧══════════════╧═══════╧═══════╧═══════╧════════════════╛ + +Here we see a single global/generic atomic instruction (**10.3.3**), which +corresponds to an L1-L2 atomic request (**16.3.12**). + +.. _generic-mixed-atomic-ex: + +Generic, mixed atomic +^^^^^^^^^^^^^^^^^^^^^ + +In our final global/generic example, we look at a case where our generic +operation targets both LDS and global memory: + +.. code-block:: cpp + + __global__ void generic_atomic(int* ptr, int filter, int zero) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x % 2 == filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_atomic((generic_ptr)generic, zero); + } + +This assigns every other work-item to atomically update global memory or +local memory. + +Running this kernel through Omniperf shows: + +.. code-block:: shell-session + + $ omniperf analyze -p workloads/vmem/mi200/ --dispatch 6 -b 10.3 12.2.0 16.3.12 -n per_kernel + <...> + 0. 
Top Stat + ╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ + │ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ + ╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ + │ 0 │ generic_atomic(int*, int, int) [clone .k │ 1.00 │ 3360.00 │ 3360.00 │ 3360.00 │ 100.00 │ + │ │ d] │ │ │ │ │ │ + ╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + + 10. Compute Units - Instruction Mix + 10.3 VMEM Instr Mix + ╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.3 │ Global/Generic Atomic │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + 
╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + + -------------------------------------------------------------------------------- + 12. Local Data Share (LDS) + 12.2 LDS Stats + ╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 12.2.0 │ LDS Instrs │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + + -------------------------------------------------------------------------------- + 16. Vector L1 Data Cache + 16.3 L1D Cache Accesses + ╒═════════╤══════════════╤═══════╤═══════╤═══════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪══════════════╪═══════╪═══════╪═══════╪════════════════╡ + │ 16.3.12 │ L1-L2 Atomic │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ + ╘═════════╧══════════════╧═══════╧═══════╧═══════╧════════════════╛ + +That is, we see: + +- A single generic atomic instruction (**10.3.3**) that maps to both + +- An LDS instruction (**12.2.0**), and + +- An L1-L2 atomic request (**16.3.12**) + +We have demonstrated the ability of the generic address space to +*dynamically* target different backing memory. + +.. _spill-scratch: + +Spill/Scratch (BUFFER) +---------------------- + +Next we examine the use of "Spill/Scratch" memory. On current CDNA +accelerators such as the :ref:`MI2XX `, this is implemented using +the :ref:`private ` memory space, which maps to +:llvm-docs:`"scratch" memory ` in AMDGPU hardware +terminology. This type of memory can be accessed via different instructions +depending on the specific architecture targeted. However, current CDNA +accelerators such as the :ref:`MI2XX ` use so-called ``buffer`` +instructions to access private memory in a simple (and typically coalesced) +manner. See +:mi200-isa-pdf:`Sec. 
9.1, "Vector Memory Buffer Instructions" of the CDNA2 ISA guide <>` +for further reading on this instruction type. + +We develop a `simple +kernel `__ +that uses stack memory: + +.. code-block:: cpp + + #include <hip/hip_runtime.h> + __global__ void knl(int* out, int filter) { + int x[1024]; + x[filter] = 0; + if (threadIdx.x < filter) + out[threadIdx.x] = x[threadIdx.x]; + } + +Our strategy here is to: + +* Create a large stack buffer (that cannot reasonably fit into registers), write to a compile-time unknown + location on the stack, and then + +* Behind the typical compile-time unknown ``if(condition_that_wont_happen)`` + +* Read from a different, compile-time unknown, location on the stack and write + to global memory to prevent the compiler from optimizing it out. + +This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and +Omniperf v2.0.0. + +.. code-block:: shell-session + + $ hipcc -O3 stack.hip -o stack + +And profiled using Omniperf: + +.. code-block:: shell-session + + $ omniperf profile -n stack --no-roof -- ./stack + <...> + $ omniperf analyze -p workloads/stack/mi200/ -b 10.3 16.3.11 -n per_kernel + <...> + 10. 
Compute Units - Instruction Mix + 10.3 VMEM Instr Mix + ╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ + │ 10.3.0 │ Global/Generic Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.4 │ Spill/Stack Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.6 │ Spill/Stack Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ + ├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ + │ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ + ╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + + -------------------------------------------------------------------------------- + 16. 
Vector L1 Data Cache + 16.3 L1D Cache Accesses + ╒═════════╤═════════════╤═══════╤═══════╤═══════╤════════════════╕ + │ Index │ Metric │ Avg │ Min │ Max │ Unit │ + ╞═════════╪═════════════╪═══════╪═══════╪═══════╪════════════════╡ + │ 16.3.11 │ L1-L2 Write │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ + ╘═════════╧═════════════╧═══════╧═══════╧═══════╧════════════════╛ + +Here we see a single write to the stack (**10.3.6**), which corresponds to +an L1-L2 write request (**16.3.11**), that is, the stack is backed by global +memory and travels through the same memory hierarchy. diff --git a/projects/rocprofiler-compute/docs/tutorial/learning-resources.rst b/projects/rocprofiler-compute/docs/tutorial/learning-resources.rst new file mode 100644 index 0000000000..931f1f7f1d --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/learning-resources.rst @@ -0,0 +1,22 @@ +.. meta:: + :description: Omniperf external training resources + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD, + training, examples + +****************** +Learning resources +****************** + +This section is a catalog of external resources and third-party content that +can help you learn Omniperf. Some areas of the following content might be +outdated. + +Introduction to Omniperf + :fab:`youtube` `AMD profiling workshop (Pawsey Supercomputing Research Centre) `_ + +Omniperf example exercises + ``__ + +AMD Instinct™ tuning guides + :doc:`rocm:how-to/tuning-guides/mi300x/workload` + diff --git a/projects/rocprofiler-compute/docs/tutorial/profiling-by-example.rst b/projects/rocprofiler-compute/docs/tutorial/profiling-by-example.rst new file mode 100644 index 0000000000..8a9c85c03b --- /dev/null +++ b/projects/rocprofiler-compute/docs/tutorial/profiling-by-example.rst @@ -0,0 +1,23 @@ +.. 
meta:: + :description: Omniperf: Profiling by example + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD + +******************** +Profiling by example +******************** + +The following examples refer to sample :doc:`HIP ` code located in +:fab:`github` :dev-sample:`ROCm/omniperf/blob/dev/sample <>` and distributed +as part of Omniperf. + +.. include:: ./includes/valu-arithmetic-instruction-mix.rst + +.. include:: ./includes/infinity-fabric-transactions.rst + +.. include:: ./includes/vector-memory-operation-counting.rst + +.. include:: ./includes/instructions-per-cycle-and-utilizations.rst + +.. include:: ./includes/lds-examples.rst + +.. include:: ./includes/occupancy-limiters-example.rst diff --git a/projects/rocprofiler-compute/docs/what-is-omniperf.rst b/projects/rocprofiler-compute/docs/what-is-omniperf.rst new file mode 100644 index 0000000000..473be896ff --- /dev/null +++ b/projects/rocprofiler-compute/docs/what-is-omniperf.rst @@ -0,0 +1,129 @@ +.. meta:: + :description: What is Omniperf? + :keywords: Omniperf, ROCm, profiler, tool, Instinct, accelerator, AMD + +***************** +What is Omniperf? +***************** + +Omniperf is a kernel-level profiling tool for machine learning and high +performance computing (HPC) workloads running on AMD Instinct™ accelerators. + +AMD Instinct MI-series accelerators are data center-class GPUs designed for +compute and have some graphics capabilities disabled or removed. Omniperf +primarily targets use with +:doc:`accelerators in the MI300, MI200, and MI100 families `. +Development is in progress to support Radeon™ (RDNA) GPUs. + +Omniperf is built on top of :doc:`ROCProfiler ` to +monitor hardware performance counters. + +.. _high-level-design: + +High-level design of Omniperf +============================= + +The architecture of Omniperf consists of three major components shown in the +following diagram. 
+ +Core Omniperf profiler +---------------------- + +Acquires raw performance counters via application replay using ``rocprof``. +Counters are stored in a comma-separated-values format for further +:doc:`analysis `. It runs a set of accelerator-specific +micro-benchmarks to acquire hierarchical roofline data. The roofline model is +not available on accelerators pre-MI200. + +Grafana server for Omniperf +--------------------------- + +* **Grafana database import**: All raw performance counters are imported into + a :ref:`backend MongoDB database ` to support + analysis and visualization in the Grafana GUI. Compatibility with + previously generated data using older Omniperf versions is not guaranteed. + +* **Grafana analysis dashboard GUI**: The + :doc:`Grafana dashboard ` retrieves the raw + counter information from the backend database. It displays the relevant + performance metrics and visualizations. + +Omniperf standalone GUI analyzer +-------------------------------- + +Omniperf provides a :doc:`standalone GUI ` to +enable basic performance analysis without the need to import data into a +database instance. Find setup instructions in :doc:`install/grafana-setup`. + +.. image:: data/install/omniperf_server_vs_client_install.png + :align: center + :alt: Architectural design of Omniperf + :width: 800 + +Omniperf features +================= + +Omniperf offers comprehensive profiling based on all available hardware counters +for the target accelerator. It delivers advanced performance analysis features, +such as system Speed-of-Light (SOL) and hardware block-level SOL evaluations. +Additionally, Omniperf provides in-depth memory chart analysis, roofline +analysis, baseline comparisons, and more, ensuring a thorough understanding of +system performance. + +Omniperf supports analysis through either the :doc:`command line ` or a +:doc:`GUI `. The following list describes Omniperf's features at a +high level. 
+ +* :doc:`Support for AMD Instinct MI300, MI200, and MI100 accelerators ` + +* :doc:`Standalone GUI analyzer ` + +* :doc:`GUI analyzer via Grafana and MongoDB ` + + * :ref:`System Info panel ` + + * :ref:`Kernel Statistic panel ` + + * :ref:`System Speed-of-Light panel ` + + * :ref:`Memory Chart Analysis panel ` + + * :ref:`Roofline Analysis panel ` + (*Supported on MI200 only, Ubuntu 20.04, SLES 15 SP3 or RHEL8*) + + * :ref:`Command Processor (CP) panel ` + + * :ref:`Workgroup Manager (SPI) panel ` + + * :ref:`Wavefront Launch panel ` + + * :ref:`Compute Unit - Instruction Mix panel ` + + * :ref:`Compute Unit - Pipeline panel ` + + * :ref:`Local Data Share (LDS) panel ` + + * :ref:`Instruction Cache panel ` + + * :ref:`Scalar L1D Cache panel ` + + * :ref:`L1 Address Processing Unit, or, Texture Addresser (TA) ` + and :ref:`L1 Backend Data Processing Unit, or, Texture Data (TD) ` panels + + * :ref:`Vector L1D Cache panel ` + + * :ref:`L2 Cache panel ` + + * :ref:`L2 Cache (per-channel) panel ` + +* :ref:`Filtering ` to reduce profiling time + + * Filtering by dispatch + + * Filtering by kernel + + * Filtering by GPU ID + +* :ref:`Baseline comparisons ` + +* :ref:`Multiple normalizations `