From c47e5838f1b9f27b1f44feb284e84f733fb2120e Mon Sep 17 00:00:00 2001 From: "Elwazir, Ammar" Date: Tue, 13 May 2025 16:18:23 -0500 Subject: [PATCH] [rocprofv3-benchmark] SDK and rocprofv3 Benchmarking Suite (#157) * Adding Benchmarking Stg1 * config fix * reset * add jpeg and decode traces in iteration * address comments benchmark config files. * address comments. * address comments. * address comments: revert cntrl ctx. * address comments: revert csv output. * resolve merge conflits. * format. * build fix. * fix hip runtime api traces. * loop cb services. * format. * bug fix. * Fix operator> - public C++ comparison operator * Update configuration options - support selected regions (--selected-regions) - support writing output config json (--output-config) - update serialization data * rocprofv3 tool library misc updates - lambda for starting context - support for writing config json * Tool library updates - Finished support for all benchmarking modes - Added build spec support to config json * Fix ROCPROFILER_SOVERSION - this value should not be multiplied by 10,000 * Minor tweak to rocprofv3 * Benchmarking scripts * formatting * Fix duplicate include * Add reproducible-dispatch-count test app - used in benchmarking * registration logging - report number of registered contexts and active contexts after client initialization * Serialize environment in rocprofv3 output config * ROCPROFILER_BUILD_BENCHMARK CMake option * Update benchmark SQL schema - hash_id is text - add md5sum to benchmarked_app - remove app_id from benchmarked_sdk - add sdk_id to benchmark_config - separate hip_trace into hip_runtime_trace and hip_compiler_trace - use INT instead of INTEGER for MySQL compatibility - add count column in benchmark_statistics - allow std_dev to be NULL in benchmark_statistics * Update rocprofv3-benchmark.py - use md5 instead of python hash (which includes random seed) - use args.mysql_database - compute md5sum of executable - fix insert_benchmark_config - marker trace fixes - memory allocation fixes - split hip_trace into hip_{runtime,compiler}_trace - remove app_id from benchmarked_sdk - support warmup runs - count field in benchmark_statistics * Support launcher and environment in YAML * Update reproducible-dispatch-count.cpp - support mode which doesn't use hip event timing * Misc rocprofv3-benchmark.py updates - fix some MySQL support - remove some unnecessary logging * support mysql db. * Format. * Updated SQL input files - moved benchmark_schema.sql to benchmark_table.sql - added benchmark_views.sql - uses {{metric}} syntax for variable substitution * cmake formatting * update rocprofv3-benchmark.py - benchmark config labels - overhead views * Encode rocprofv3-benchmark PID in rocprofv3 and timem output files * Minor tweak to benchmark_views.sql - include count - reorder fields for readability * split statements and use IS if values is NONE. * use backtick instead of double quotes and add IS before NOT NULL.: * Adding Mandelbrot Benchmark App * Adding Dockerfile example * Update dockerfile * Update dockerfile * [SDK] rocprofiler_query_external_correlation_id_request_kind_name * Execution-profile benchmark mode * Execution profile SQL support * Rename mandlebrot folder + misc clang-tidy * [rocprofv3-benchmark] Execution profile support * Update installation * add work dir when setting git revision, useful when building outside src. * Set FULL_VERSION_STRING and ROCPROFILER_SDK_GIT_REVISION - when benchmark folder is top-level * Remove unused python packages from requirements.txt * Use ldd/pyelftools to include linked libs for md5sum - also add --filter-benchmark and --filter-rocprofv3 options - support labeling the rocprofv3 options - use more argparse groups - more generic application of filters - support variable substitution in environment, e.g. PATH=/some/path:$PATH * Environment improvements - improve reproducibility when env set via input file vs. shell - support "environment-ignore" to remove environment variables * Misc formatting * Misc. fix * use backticks for defining new columns name * Support shuffling the order of benchmark modes/rocprofv3 args * Address review comments * Update Dockerfile - rename to Dockerfile - reduce to one layer * Support docker build arg BRANCH --------- Co-authored-by: Ammar ELWazir Co-authored-by: Kandula, Venkateshwar reddy Co-authored-by: Venkateshwar Reddy Kandula Co-authored-by: Madsen, Jonathan Co-authored-by: Jonathan R. Madsen [ROCm/rocprofiler-sdk commit: 6f17da7adeae302a2facff2b28114573202705a1] --- .../.github/workflows/formatting.yml | 4 +- projects/rocprofiler-sdk/CMakeLists.txt | 5 + projects/rocprofiler-sdk/benchmark/.gitignore | 10 + .../rocprofiler-sdk/benchmark/CMakeLists.txt | 101 ++ projects/rocprofiler-sdk/benchmark/README.md | 21 + .../benchmark/cmake/timem.cmake | 53 + .../rocprofiler-sdk/benchmark/example.yml | 50 + .../rocprofiler-sdk/benchmark/minimal.yaml | 32 + .../benchmark/requirements.txt | 3 + .../benchmark/source/CMakeLists.txt | 7 + .../benchmark/source/bin/CMakeLists.txt | 24 + .../source/bin/mandelbrot/CMakeLists.txt | 44 + .../source/bin/mandelbrot/mandelbrot.cpp | 938 +++++++++++ .../benchmark/source/bin/mandelbrot/utils.cpp | 324 ++++ .../benchmark/source/bin/mandelbrot/utils.hpp | 733 ++++++++ .../source/bin/rocprofv3-benchmark.py | 1487 +++++++++++++++++ .../benchmark/source/lib/CMakeLists.txt | 11 + .../benchmark/source/share/CMakeLists.txt | 5 + .../share/rocprofiler-sdk/CMakeLists.txt | 12 + .../rocprofiler-sdk/benchmark_tables.sql | 137 ++ .../share/rocprofiler-sdk/benchmark_views.sql | 75 + .../cmake/rocprofiler_config_packaging.cmake | 9 +- .../cmake/rocprofiler_formatting.cmake | 2 +- .../cmake/rocprofiler_options.cmake | 1 + projects/rocprofiler-sdk/docker/Dockerfile | 15 + .../rocprofiler-sdk/source/bin/rocprofv3.py | 47 + .../source/docs/how-to/samples.rst | 2 +- .../source/docs/install/installation.rst | 2 +- .../include/rocprofiler-sdk/cxx/operators.hpp | 2 +- .../rocprofiler-sdk/external_correlation.h | 23 + .../include/rocprofiler-sdk/version.h.in | 2 +- .../source/lib/output/output_config.hpp | 1 + .../lib/rocprofiler-sdk-tool/CMakeLists.txt | 2 +- .../lib/rocprofiler-sdk-tool/config.cpp | 17 + .../lib/rocprofiler-sdk-tool/config.hpp | 32 +- .../execution_profile.hpp | 104 ++ .../source/lib/rocprofiler-sdk-tool/tool.cpp | 740 +++++--- .../rocprofiler-sdk/external_correlation.cpp | 67 +- .../rocprofiler-sdk/external_correlation.hpp | 1 + .../lib/rocprofiler-sdk/registration.cpp | 27 + .../rocprofiler-sdk/tests/bin/CMakeLists.txt | 1 + .../CMakeLists.txt | 59 + .../reproducible-dispatch-count.cpp | 254 +++ 43 files changed, 5206 insertions(+), 280 deletions(-) create mode 100644 projects/rocprofiler-sdk/benchmark/.gitignore create mode 100644 projects/rocprofiler-sdk/benchmark/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/benchmark/README.md create mode 100644 projects/rocprofiler-sdk/benchmark/cmake/timem.cmake create mode 100644 projects/rocprofiler-sdk/benchmark/example.yml create mode 100644 projects/rocprofiler-sdk/benchmark/minimal.yaml create mode 100644 projects/rocprofiler-sdk/benchmark/requirements.txt create mode 100644 projects/rocprofiler-sdk/benchmark/source/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/benchmark/source/bin/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/mandelbrot.cpp create mode 100644 projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.cpp create mode 100644 projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.hpp create mode 100755 projects/rocprofiler-sdk/benchmark/source/bin/rocprofv3-benchmark.py create mode 100644 projects/rocprofiler-sdk/benchmark/source/lib/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/benchmark/source/share/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_tables.sql create mode 100644 projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_views.sql create mode 100644 projects/rocprofiler-sdk/docker/Dockerfile create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/execution_profile.hpp create mode 100644 projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/reproducible-dispatch-count.cpp diff --git a/projects/rocprofiler-sdk/.github/workflows/formatting.yml b/projects/rocprofiler-sdk/.github/workflows/formatting.yml index 4b038dd405..a721226e2e 100644 --- a/projects/rocprofiler-sdk/.github/workflows/formatting.yml +++ b/projects/rocprofiler-sdk/.github/workflows/formatting.yml @@ -70,7 +70,7 @@ jobs: - name: Run clang-format run: | set +e - FILES=$(find samples source tests -type f | egrep '\.(h|hpp|hh|c|cc|cpp)(|\.in)$') + FILES=$(find samples source tests benchmark -type f | egrep '\.(h|hpp|hh|c|cc|cpp)(|\.in)$') FORMAT_OUT=$(clang-format-11 -i ${FILES}) if [ $(git diff | wc -l) -ne 0 ]; then echo -e "\nError! Code not formatted. Run clang-format (version 11)...\n" @@ -130,7 +130,7 @@ jobs: shell: bash run: | OUTFILE=missing_newline.txt - for i in $(find source/lib source/include tests samples cmake -type f | egrep -v '\.bin$'); do VAL=$(tail -c 1 ${i}); if [ -n "${VAL}" ]; then echo "- ${i}" >> ${OUTFILE}; fi; done + for i in $(find source tests samples benchmark docker cmake -type f | egrep -v '\.(bin|png|csv)$|source/docs/_(build|doxygen)'); do VAL=$(tail -c 1 ${i}); if [ -n "${VAL}" ]; then echo "- ${i}" >> ${OUTFILE}; fi; done if [[ -f ${OUTFILE} && $(cat ${OUTFILE} | wc -l) -gt 0 ]]; then echo -e "\nError! Source code missing new line at end of file...\n" echo -e "\nFiles:\n" diff --git a/projects/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/CMakeLists.txt index dccafa1623..53f9e247ad 100644 --- a/projects/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-sdk/CMakeLists.txt @@ -54,6 +54,7 @@ if(Git_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse HEAD + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE ROCPROFILER_SDK_GIT_REVISION OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) else() @@ -119,6 +120,10 @@ if(ROCPROFILER_BUILD_SAMPLES) add_subdirectory(samples) endif() +if(ROCPROFILER_BUILD_BENCHMARK) + add_subdirectory(benchmark) +endif() + include(rocprofiler_config_packaging) rocprofiler_print_features() diff --git a/projects/rocprofiler-sdk/benchmark/.gitignore b/projects/rocprofiler-sdk/benchmark/.gitignore new file mode 100644 index 0000000000..427c18c0fc --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/.gitignore @@ -0,0 +1,10 @@ +# Exclude databases +*.db +*.db-journal +/.rocprofv3/** + +# Build directories +/compile_commands.json +/build* +/.cache +*.vscode diff --git a/projects/rocprofiler-sdk/benchmark/CMakeLists.txt b/projects/rocprofiler-sdk/benchmark/CMakeLists.txt new file mode 100644 index 0000000000..7b7abf59eb --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/CMakeLists.txt @@ -0,0 +1,101 @@ +# +# Integration tests +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(COMMAND rocprofiler_deactivate_clang_tidy) + rocprofiler_deactivate_clang_tidy() +endif() + +project(rocprofiler-sdk-benchmark LANGUAGES C CXX) + +# +# project options +# +option(ROCPROFILER_BENCHMARK_INSTALL_TIMEM "Install timem" ON) + +# +# cmake overrides +# +set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "benchmark") +set(ROCPROFILER_SDK_BENCHMARK_SOURCE_DIR "${PROJECT_SOURCE_DIR}") +set(ROCPROFILER_SDK_BENCHMARK_BINARY_DIR "${PROJECT_BINARY_DIR}") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE + "Release" + CACHE STRING "" FORCE) +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) + +enable_testing() +include(CTest) + +include(GNUInstallDirs) +# always use lib instead of lib64 +set(CMAKE_INSTALL_LIBDIR "lib") +set(CMAKE_INSTALL_LIB64DIR "lib64") + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}") +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_DATAROOT_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}") + +# define the library output directory +if(PROJECT_IS_TOP_LEVEL) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../VERSION" FULL_VERSION_STRING LIMIT_COUNT 1) + string(REGEX REPLACE "(\n|\r)" "" FULL_VERSION_STRING "${FULL_VERSION_STRING}") + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)(.*)" "\\1.\\2.\\3" + ROCPROFILER_SDK_VERSION "${FULL_VERSION_STRING}") + + find_package(Git) + + if(Git_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/../.git") + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse HEAD + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/.. + OUTPUT_VARIABLE ROCPROFILER_SDK_GIT_REVISION + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) + else() + set(ROCPROFILER_SDK_GIT_REVISION "") + endif() +else() + set(ROCPROFILER_BENCHMARK_INSTALL_PREFIX + ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-sdk/benchmark + CACHE PATH "relative install prefix for rocprofiler-sdk benchmark") + set(_GNUInstallDirs_VARIABLES + BINDIR + SBINDIR + LIBEXECDIR + SYSCONFDIR + SHAREDSTATEDIR + LOCALSTATEDIR + RUNSTATEDIR + LIBDIR + LIB64DIR # locally defined + INCLUDEDIR + OLDINCLUDEDIR + DATAROOTDIR + DATADIR + INFODIR + LOCALEDIR + MANDIR + DOCDIR) + + foreach(_INSTALL_DIR ${_GNUInstallDirs_VARIABLES}) + set(CMAKE_INSTALL_${_INSTALL_DIR} + ${ROCPROFILER_BENCHMARK_INSTALL_PREFIX}/${CMAKE_INSTALL_${_INSTALL_DIR}}) + endforeach() +endif() + +# include the timem executable +include(timem) + +add_subdirectory(source) diff --git a/projects/rocprofiler-sdk/benchmark/README.md b/projects/rocprofiler-sdk/benchmark/README.md new file mode 100644 index 0000000000..8012c499e8 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/README.md @@ -0,0 +1,21 @@ +# Benchmark Suite + +## Generate Data + +From the current directory: + +```shell +cmake -B build-benchmark . +cd build-benchmark +export PATH=${PWD}/bin:${PATH} +rocprofv3-benchmark -i ./example.yml -n 2 +``` + +```shell +sqlite3 benchmark.db +``` + +```sql +SELECT * FROM benchmark_metrics; +SELECT * FROM benchmark_statistics; +``` diff --git a/projects/rocprofiler-sdk/benchmark/cmake/timem.cmake b/projects/rocprofiler-sdk/benchmark/cmake/timem.cmake new file mode 100644 index 0000000000..35a6fcb456 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/cmake/timem.cmake @@ -0,0 +1,53 @@ +# +# timem installation +# + +if(NOT ROCPROFILER_BENCHMARK_INSTALL_TIMEM) + find_program( + TIMEM_EXECUTABLE + NAMES timem + HINTS ${PROJECT_BINARY_DIR} + PATHS ${PROJECT_BINARY_DIR} + PATH_SUFFIXES bin) +endif() + +if(NOT TIMEM_EXECUTABLE OR NOT EXISTS "${TIMEM_EXECUTABLE}") + set(TIMEM_INSTALLER + ${CMAKE_CURRENT_BINARY_DIR}/installer/timemory-timem-1.0.0-Linux.sh) + find_program(SHELL_EXECUTABLE NAMES sh bash REQUIRED) + + file( + DOWNLOAD + https://github.com/ROCm/timemory/releases/download/timemory-timem%2Fv0.0.4/timemory-timem-1.0.0-Linux.sh + ${TIMEM_INSTALLER} + EXPECTED_MD5 63da7df7996a86d6d9ce312276c2f014 + INACTIVITY_TIMEOUT 30 + TIMEOUT 300 + SHOW_PROGRESS) + + execute_process( + COMMAND ${SHELL_EXECUTABLE} ${TIMEM_INSTALLER} --prefix=${PROJECT_BINARY_DIR} + --exclude-subdir --skip-license + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + RESULT_VARIABLE _RET + OUTPUT_VARIABLE _OUT + ERROR_VARIABLE _ERR + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE) + + if(NOT EXISTS ${PROJECT_BINARY_DIR}/bin/timem OR NOT _RET EQUAL 0) + message( + FATAL_ERROR + "timem installation failed with exit code ${_RET}.\nSTDOUT:\n\t${_OUT}\nSTDERR:\n\t${_ERR}" + ) + endif() +endif() + +find_program( + TIMEM_EXECUTABLE + NAMES timem REQUIRED + HINTS ${PROJECT_BINARY_DIR} + PATHS ${PROJECT_BINARY_DIR} + PATH_SUFFIXES bin) + +add_executable(rocprofiler-sdk::timem IMPORTED) +set_property(TARGET rocprofiler-sdk::timem PROPERTY IMPORTED_LOCATION ${TIMEM_EXECUTABLE}) diff --git a/projects/rocprofiler-sdk/benchmark/example.yml b/projects/rocprofiler-sdk/benchmark/example.yml new file mode 100644 index 0000000000..3fe2c964cc --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/example.yml @@ -0,0 +1,50 @@ +defaults: + rocprofv3: + # keys such as "runtime", "kernel", etc. below can be used with --filter-rocprofv3 option + runtime: [--runtime-trace] + sys: [--sys-trace] + kernel: [--kernel-trace] + hip: [--hip-trace] + hsa: [--hsa-trace] + memcpy: [--memory-copy-trace] + malloc: [--memory-allocation-trace] + scratch: [--scratch-memory-trace] + counters: [--pmc, SQ_WAVES] + benchmark: + - baseline + - disabled-sdk-contexts + - sdk-buffer-overhead + - sdk-callback-overhead + - tool-runtime-overhead + group: + - examples + environment: + ROCR_VISIBLE_DEVICES: "0,2" + HIP_VISIBLE_DEVICES: "0,2" + +jobs: + # explicitly specifies name, group, and command. inherits default rocprofv3 commands and benchmark modes + - name: hip-in-libraries + group: [examples, multigpu, multistream] + command: [hip-in-libraries] + + # explicitly specifies name, group, command, and rocprofv3. inherits default benchmark modes + - name: transpose + group: [examples, multithreaded, multigpu, multistream] + command: [transpose, 4, 500, 10] + launcher: [mpirun, -n, 1] + rocprofv3: + - [-r, --pmc, SQ_WAVES] + - [--kernel-trace] + - [--hip-trace] + + # explicitly specifies name, command, rocprofv3, and benchmark. inherits default group(s) + - name: hip-graph + command: [hip-graph, 8, 500] + rocprofv3: + - [--kernel-trace] + - [--hip-trace] + - [--hsa-trace] + benchmark: + - baseline + - disabled-sdk-contexts diff --git a/projects/rocprofiler-sdk/benchmark/minimal.yaml b/projects/rocprofiler-sdk/benchmark/minimal.yaml new file mode 100644 index 0000000000..d9166ec5c1 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/minimal.yaml @@ -0,0 +1,32 @@ +defaults: + rocprofv3: + # keys are not provided so --filter-rocprofv3 option isn't supported for this YAML input + - [--runtime-trace] + - [--sys-trace] + - [--kernel-trace] + - [--hip-trace] + - [--hsa-trace] + - [--memory-copy-trace] + - [--memory-allocation-trace] + - [--scratch-memory-trace] + - [--pmc, SQ_WAVES] + benchmark: + # these names can be used with --filter-benchmark option + - baseline + - disabled-sdk-contexts + - sdk-buffer-overhead + - sdk-callback-overhead + - tool-runtime-overhead + group: + - examples + +jobs: + # explicitly specifies name, command, rocprofv3, and benchmark. inherits default group(s) + - name: hip-graph + command: [hip-graph, 8, 500] + rocprofv3: + - [--kernel-trace] + - [--hip-trace] + benchmark: + - baseline + - disabled-sdk-contexts diff --git a/projects/rocprofiler-sdk/benchmark/requirements.txt b/projects/rocprofiler-sdk/benchmark/requirements.txt new file mode 100644 index 0000000000..0d73ad44af --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/requirements.txt @@ -0,0 +1,3 @@ +# these are optional, not required +mysql-connector-python +pyelftools diff --git a/projects/rocprofiler-sdk/benchmark/source/CMakeLists.txt b/projects/rocprofiler-sdk/benchmark/source/CMakeLists.txt new file mode 100644 index 0000000000..cbce7e16b4 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/CMakeLists.txt @@ -0,0 +1,7 @@ +# +# +# + +add_subdirectory(lib) +add_subdirectory(bin) +add_subdirectory(share) diff --git a/projects/rocprofiler-sdk/benchmark/source/bin/CMakeLists.txt b/projects/rocprofiler-sdk/benchmark/source/bin/CMakeLists.txt new file mode 100644 index 0000000000..bcb09c2130 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/bin/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# +# + +# Adding main rocprofv3 +configure_file(rocprofv3-benchmark.py + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/rocprofv3-benchmark @ONLY) + +install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/rocprofv3-benchmark + DESTINATION ${CMAKE_INSTALL_BINDIR} + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ + WORLD_EXECUTE) + +# downloaded timem +install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/timem + DESTINATION ${CMAKE_INSTALL_BINDIR} + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ + WORLD_EXECUTE + OPTIONAL) + +# Adding Benchmark Workloads +add_subdirectory(mandelbrot) diff --git a/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/CMakeLists.txt b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/CMakeLists.txt new file mode 100644 index 0000000000..5f12744847 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-benchmark-bin-mandelbrot LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(mandelbrot.cpp PROPERTIES LANGUAGE HIP) +set_source_files_properties(utils.cpp PROPERTIES LANGUAGE HIP) + +add_executable(mandelbrot) +target_sources(mandelbrot PRIVATE mandelbrot.cpp utils.cpp) +target_compile_options(mandelbrot PRIVATE -W -Wall -Wextra -Wpedantic -Werror + -ffp-contract=fast) +target_include_directories(mandelbrot PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + +install( + TARGETS mandelbrot + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT benchmark) diff --git a/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/mandelbrot.cpp b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/mandelbrot.cpp new file mode 100644 index 0000000000..e77947dc52 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/mandelbrot.cpp @@ -0,0 +1,938 @@ +/* + Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp + * TEST: %t + * HIT_END + */ + +#include "utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + double x; + double y; + double width; +} coordRec; + +coordRec coords[] = { + {0.0, 0.0, 4.0}, // Whole set + {0.0, 0.0, 0.00001}, // All black + {-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +template +__global__ void +float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter) +{ + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float) (xPos + xStep * i); + float y0 = (float) (yPos + yStep * j); + + float x = x0; + float y = y0; + + uint iter = 0; + float tmp; + for(iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) + { + tmp = x; + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); + } + + out[tid] = iter; +}; + +template +__global__ void +float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter) +{ + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float) (xPos + xStep * (float) i); + float y0 = (float) (yPos + yStep * (float) j); + + float x = x0; + float y = y0; + +#define FAST + uint iter = 0; + float tmp; + int stay; + uint ccount = 0; + stay = (x * x + y * y) <= 4.0; + float savx = x; + float savy = y; +#ifdef FAST + for(iter = 0; (iter < maxIter); iter += 16) + { +#else + for(iter = 0; stay && (iter < maxIter); iter += 16) + { +#endif + x = savx; + y = savy; + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + stay = (x * x + y * y) <= 4.0; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + ccount += stay * 16; +#ifdef FAST + if(!stay) break; +#endif + } + // Handle remainder + if(!stay) + { + iter = 16; + do + { + x = savx; + y = savy; + stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + } while(stay && iter); + } + + out[tid] = (uint) ccount; +}; + +template +__global__ void +double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter) +{ + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + double x0 = (double) (xPos + xStep * i); + double y0 = (double) (yPos + yStep * j); + + double x = x0; + double y = y0; + + uint iter = 0; + double tmp; + for(iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) + { + tmp = x; + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); + } + out[tid] = iter; +}; + +template +__global__ void +double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter) +{ + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + + int i = tid % width; + int j = tid / width; + double x0 = (double) (xPos + xStep * (double) i); + double y0 = (double) (yPos + yStep * (double) j); + + double x = x0; + double y = y0; + +#define FAST + uint iter = 0; + double tmp; + int stay; + uint ccount = 0; + stay = (x * x + y * y) <= 4.0; + double savx = x; + double savy = y; +#ifdef FAST + for(iter = 0; (iter < maxIter); iter += 16) +#else + for(iter = 0; stay && (iter < maxIter); iter += 16) +#endif + { + x = savx; + y = savy; + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + // Two iterations + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); + + stay = (x * x + y * y) <= 4.0; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + ccount += stay * 16; +#ifdef FAST + if(!stay) break; +#endif + } + // Handle remainder + if(!stay) + { + iter = 16; + do + { + x = savx; + y = savy; + stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + } while(stay && iter); + } + out[tid] = (uint) ccount; +}; + +// Commenting it out as it is not used anywhere in the code +// static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15; + +// Expected results for each kernel run at each coord +unsigned long long expectedIters[] = { + 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull, + 203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull, + 203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull, + 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull, + 203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; + +class hipPerfMandelBrot +{ +public: + hipPerfMandelBrot(); + ~hipPerfMandelBrot(); + + void setNumKernels(unsigned int num) { numKernels = num; } + + unsigned int getNumKernels() const { return numKernels; } + + void setNumStreams(unsigned int num) { numStreams = num; } + unsigned int getNumStreams() const { return numStreams; } + + void open(int deviceID); + void run(unsigned int testCase, unsigned int deviceId); + void printResults(void); + + // array of funtion pointers + typedef void (hipPerfMandelBrot::*funPtr)(uint* out, + uint width, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt); + + // Wrappers + void float_mad(uint* out, + uint width, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt); + + void float_mandel_unroll(uint* out, + uint width, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt); + + void double_mad(uint* out, + uint width, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt); + + void double_mandel_unroll(uint* out, + uint width, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt); + + hipStream_t streams[2]; + +private: + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); + + unsigned int numKernels; + unsigned int numStreams; + + std::map> results; + unsigned int width_; + unsigned int bufSize; + unsigned int maxIter; + unsigned int coordIdx; + volatile unsigned long long totalIters = 0; + int numCUs; + static const unsigned int numLoops = 10; +}; + +hipPerfMandelBrot::hipPerfMandelBrot() = default; + +hipPerfMandelBrot::~hipPerfMandelBrot() = default; + +void +hipPerfMandelBrot::open(int deviceId) +{ + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if(nGpu < 1) + { + failed("No GPU!"); + } + + HIPCHECK(hipSetDevice(deviceId)); + hipDeviceProp_t props = {}; + HIPCHECK(hipGetDeviceProperties(&props, deviceId)); + std::cout << "info: running on bus " + << "0x" << props.pciBusID << " " << props.name << " with " + << props.multiProcessorCount << " CUs" + << " and device id: " << deviceId << std::endl; + + numCUs = props.multiProcessorCount; +} + +void +hipPerfMandelBrot::printResults() +{ + // int numkernels = getNumKernels(); + int numStreams = getNumStreams(); + + std::cout << "\n" + << "Measured perf for kernels in GFLOPS on " << numStreams << " streams (s)" + << std::endl; + + std::map>::iterator itr; + for(itr = results.begin(); itr != results.end(); itr++) + { + std::cout << "\n" << std::setw(20) << itr->first << " "; + for(auto i : results[itr->first]) + { + std::cout << std::setw(10) << i << " "; + } + } + results.clear(); + + std::cout << std::endl; +} + +// Wrappers for the kernel launches +void +hipPerfMandelBrot::float_mad(uint* out, + uint /* width */, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt) +{ + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mad_kernel, + dim3(blocks), + dim3(threads_per_block), + 0, + streams[kernelCnt % streamCnt], + out, + width_, + xPos, + yPos, + xStep, + yStep, + maxIter); +} + +void +hipPerfMandelBrot::float_mandel_unroll(uint* out, + uint /* width */, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt) +{ + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mandel_unroll_kernel, + dim3(blocks), + dim3(threads_per_block), + 0, + streams[kernelCnt % streamCnt], + out, + width_, + xPos, + yPos, + xStep, + yStep, + maxIter); +} + +void +hipPerfMandelBrot::double_mad(uint* out, + uint /* width */, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt) +{ + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(double_mad_kernel, + dim3(blocks), + dim3(threads_per_block), + 0, + streams[kernelCnt % streamCnt], + out, + width_, + xPos, + yPos, + xStep, + yStep, + maxIter); +} + +void +hipPerfMandelBrot::double_mandel_unroll(uint* out, + uint /* width */, + float xPos, + float yPos, + float xStep, + float yStep, + uint maxIter, + hipStream_t* streams, + int blocks, + int threads_per_block, + int kernelCnt) +{ + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mandel_unroll_kernel, + dim3(blocks), + dim3(threads_per_block), + 0, + streams[kernelCnt % streamCnt], + out, + width_, + xPos, + yPos, + xStep, + yStep, + maxIter); +} + +void +hipPerfMandelBrot::run(unsigned int testCase, unsigned int /* deviceId */) +{ + unsigned int numStreams = getNumStreams(); + coordIdx = testCase % numCoords; + + funPtr p[] = {&hipPerfMandelBrot::float_mad, + &hipPerfMandelBrot::float_mandel_unroll, + &hipPerfMandelBrot::double_mad, + &hipPerfMandelBrot::double_mandel_unroll}; + + // Maximum iteration count + maxIter = 32768; + + // Variable-length arrays (VLAs) are not part of the C++ standard, using std::vector instead + // uint * hPtr[numKernels]; + // uint * dPtr[numKernels]; + std::vector hPtr(numKernels); + std::vector dPtr(numKernels); + + // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. + width_ = 256; + + bufSize = width_ * width_ * sizeof(uint); + + // Create streams for concurrency + for(uint i = 0; i < numStreams; i++) + { + HIPCHECK(hipStreamCreate(&streams[i])); + } + + // Allocate memory on the host and device + for(uint i = 0; i < numKernels; i++) + { + HIPCHECK(hipHostMalloc((void**) &hPtr[i], bufSize, hipHostMallocDefault)); + setData(hPtr[i], 0xdeadbeef); + HIPCHECK(hipMalloc((uint**) &dPtr[i], bufSize)) + } + + // Prepare kernel launch parameters + int threads = (bufSize / sizeof(uint)); + int threads_per_block = 64; + int blocks = (threads / threads_per_block) + (threads % threads_per_block); + + // float xStep = (float)(coords[coordIdx].width / (double)width_); + // float yStep = (float)(-coords[coordIdx].width / (double)width_); + // float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + // float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Copy memory asynchronously and concurrently from host to device + for(uint i = 0; i < numKernels; i++) + { + HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); + } + + // Synchronize to make sure all the copies are completed + HIPCHECK(hipStreamSynchronize(nullptr)); + + int kernelIdx; + if(testCase == 0 || testCase == 5 || testCase == 10) + { + kernelIdx = 0; + } + + else if(testCase == 1 || testCase == 6 || testCase == 11) + { + kernelIdx = 1; + } + else if(testCase == 2 || testCase == 7 || testCase == 12) + { + kernelIdx = 2; + } + else if(testCase == 3 || testCase == 8 || testCase == 13) + { + kernelIdx = 3; + } + + double totalTime = 0.0; + + for(unsigned int k = 0; k < numLoops; k++) + { + if((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 || + testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) + { + float xStep = (float) (coords[coordIdx].width / (double) width_); + float yStep = (float) (-coords[coordIdx].width / (double) width_); + float xPos = (float) (coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = (float) (coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for(uint i = 0; i < numKernels; i++) + { + (this->*p[kernelIdx])(dPtr[i], + width_, + xPos, + yPos, + xStep, + yStep, + maxIter, + streams, + blocks, + threads_per_block, + i); + } + + // Synchronize all the concurrent streams to have completed execution + HIPCHECK(hipStreamSynchronize(0)); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } + + else + { + double xStep = coords[coordIdx].width / (double) width_; + double yStep = -coords[coordIdx].width / (double) width_; + double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; + double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for(uint i = 0; i < numKernels; i++) + { + (this->*p[kernelIdx])(dPtr[i], + width_, + xPos, + yPos, + xStep, + yStep, + maxIter, + streams, + blocks, + threads_per_block, + i); + } + + // Synchronize all the concurrent streams to have completed execution + HIPCHECK(hipStreamSynchronize(0)); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } + } + + // Copy data back from device to the host + for(uint i = 0; i < numKernels; i++) + { + HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); + } + + for(uint i = 0; i < numKernels; i++) + { + checkData(hPtr[i]); + + int j = 0; + while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) + { + j++; + } + + if(j == 30) + { + std::cout << "Incorrect iteration count detected. "; + } + } + + // Compute GFLOPS. There are 7 FLOPs per iteration + double perf = ((double) (totalIters * numKernels) * 7 * (double) (1e-09)) / + (totalTime / (double) numLoops); + + std::vector kernelName = {"float", "float_unroll", "double", "double_unroll"}; + + // Print results except for Warm-up kernel + if(testCase != 100) + { + results[kernelName[testCase % 4]].push_back(perf); + } + + for(uint i = 0; i < numStreams; i++) + { + HIPCHECK(hipStreamDestroy(streams[i])); + } + + // Free host and device memory + for(uint i = 0; i < numKernels; i++) + { + HIPCHECK(hipHostFree(hPtr[i])); + HIPCHECK(hipFree(dPtr[i])); + } +} + +void +hipPerfMandelBrot::setData(void* ptr, unsigned int value) +{ + unsigned int* ptr2 = (unsigned int*) ptr; + for(unsigned int i = 0; i < width_ * width_; i++) + { + ptr2[i] = value; + } +} + +void +hipPerfMandelBrot::checkData(uint* ptr) +{ + totalIters = 0; + for(unsigned int i = 0; i < width_ * width_; i++) + { + totalIters += ptr[i]; + } +} + +int +main(int argc, char* argv[]) +{ + // Default values for kernels and streams + unsigned int numStreamsWarmup = 1, numKernelsWarmup = 1; + unsigned int numStreamsSync = 1, numKernelsSync = 1; + unsigned int numStreamsAsync = 2, numKernelsAsync = 2; + + // Check for help arguments + if(argc > 1 && (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help" || + std::string(argv[1]) == "help")) + { + std::cout << "Usage: " << argv[0] + << " [--warmup :] [--sync :] " + "[--async :]" + << std::endl; + std::cout << "Example: " << argv[0] << " --warmup 1:1 --sync 2:4 --async 3:6" << std::endl; + std::exit(EXIT_SUCCESS); // Exit with success status + } + + // Parse command-line arguments + for(int i = 1; i < argc; i++) + { + std::string arg = argv[i]; + if(arg == "--warmup" && i + 1 < argc) + { + std::string value = argv[++i]; + std::stringstream ss(value); + char delimiter; + ss >> numStreamsWarmup >> delimiter >> numKernelsWarmup; + if(delimiter != ':' || ss.fail()) + { + std::cerr << "Invalid format for --warmup. Expected :." + << std::endl; + std::exit(EXIT_FAILURE); + } + } + else if(arg == "--sync" && i + 1 < argc) + { + std::string value = argv[++i]; + std::stringstream ss(value); + char delimiter; + ss >> numStreamsSync >> delimiter >> numKernelsSync; + if(delimiter != ':' || ss.fail()) + { + std::cerr << "Invalid format for --sync. Expected :." + << std::endl; + std::exit(EXIT_FAILURE); + } + } + else if(arg == "--async" && i + 1 < argc) + { + std::string value = argv[++i]; + std::stringstream ss(value); + char delimiter; + ss >> numStreamsAsync >> delimiter >> numKernelsAsync; + if(delimiter != ':' || ss.fail()) + { + std::cerr << "Invalid format for --async. Expected :." + << std::endl; + std::exit(EXIT_FAILURE); + } + } + else + { + std::cerr << "Unknown argument: " << arg << std::endl; + std::exit(EXIT_FAILURE); + } + } + + int deviceCount = 0; + HIPCHECK(hipGetDeviceCount(&deviceCount)); // Get the number of devices + +#pragma omp parallel for + for(int deviceId = 0; deviceId < deviceCount; deviceId++) + { + hipPerfMandelBrot mandelbrotCompute; + mandelbrotCompute.open(deviceId); + + for(unsigned int testCase = 0; testCase < 3; testCase++) + { + switch(testCase) + { + case 0: + { + // Warmup-kernel - default stream executes serially + mandelbrotCompute.setNumStreams(numStreamsWarmup); + mandelbrotCompute.setNumKernels(numKernelsWarmup); + mandelbrotCompute.run(100 /*Random number*/, deviceId); + break; + } + + case 1: + { + // run all - sync + int i = 0; + do + { + mandelbrotCompute.setNumStreams(numStreamsSync); + mandelbrotCompute.setNumKernels(numKernelsSync); + mandelbrotCompute.run(i, deviceId); + i++; + } while(i < 12); + mandelbrotCompute.printResults(); + + break; + } + + case 2: + { + // run all - async + int i = 0; + do + { + mandelbrotCompute.setNumStreams(numStreamsAsync); + mandelbrotCompute.setNumKernels(numKernelsAsync); + mandelbrotCompute.run(i, deviceId); + i++; + } while(i < 12); + mandelbrotCompute.printResults(); + + break; + } + + default: + { + break; + } + } + } + } + + passed(); +} diff --git a/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.cpp b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.cpp new file mode 100644 index 0000000000..8674ccee58 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.cpp @@ -0,0 +1,324 @@ +/* +Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "utils.hpp" + +#include +#ifdef __linux__ +# include +#elif defined(_WIN32) +# include +#endif + +// standard global variables that can be set on command line +size_t N = 4 * 1024 * 1024; +char memsetval = 0x42; +int memsetD32val = 0xDEADBEEF; +short memsetD16val = 0xDEAD; +char memsetD8val = 0xDE; +int iterations = 1; +unsigned blocksPerCU = 6; // to hide latency +unsigned threadsPerBlock = 256; +int textureFilterMode = 0; // 0: hipFilterModePoint; 1: hipFilterModeLinear +int p_gpuDevice = 0; +unsigned p_verbose = 0; +int p_tests = -1; /*which tests to run. Interpretation is left to each test. default:all*/ +int debug_test = 0; +#ifdef _WIN64 +const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES="; +const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES="; +const char* PATH_SEPERATOR_STR = "\\"; +const char* NULL_DEVICE = "NUL:"; +#else +const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES"; +const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES"; +const char* PATH_SEPERATOR_STR = "/"; +const char* NULL_DEVICE = "/dev/null"; +#endif + +#ifdef _WIN64 +// Windows does not have rand_r, use srand and rand instead. +int +rand_r(unsigned int* s) +{ + srand(*s); + return rand(); +} +#endif + +// Get Free Memory from the system +static size_t +getMemoryAmount() +{ +#if __linux__ + struct sysinfo info; + int _ = sysinfo(&info); + return info.freeram / (1024 * 1024); // MB +#elif defined(_WIN32) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + GlobalMemoryStatusEx(&statex); + return (statex.ullAvailPhys / (1024 * 1024)); // MB +#endif +} + +size_t +getHostThreadCount(const size_t memPerThread, const size_t maxThreads) +{ + if(memPerThread == 0) return 0; + auto memAmount = getMemoryAmount(); + const auto processor_count = std::thread::hardware_concurrency(); + if(processor_count == 0 || memAmount == 0) return 0; + size_t thread_count = 0; + if((processor_count * memPerThread) < memAmount) + thread_count = processor_count; + else + thread_count = reinterpret_cast(memAmount / memPerThread); + if(maxThreads > 0) + { + return (thread_count > maxThreads) ? maxThreads : thread_count; + } + return thread_count; +} + +// Function to determine if the device is of gfx11 architecture +bool +IsGfx11() +{ +#if defined(__HIP_PLATFORM_NVIDIA__) + return false; +#elif defined(__HIP_PLATFORM_AMD__) + int device = -1; + hipDeviceProp_t props{}; + HIPCHECK(hipGetDevice(&device)); + HIPCHECK(hipGetDeviceProperties(&props, device)); + + // Get GCN Arch Name and compare to check if it is gfx11 + std::string arch = std::string(props.gcnArchName); + auto pos = arch.find(":"); + if(pos != std::string::npos) arch = arch.substr(0, pos); + + if(arch.size() >= 5) arch = arch.substr(0, 5); + + return (arch == std::string("gfx11")) ? true : false; +#else + std::cout << "Have to be either Nvidia or AMD platform, asserting" << std::endl; + assert(false); +#endif +} + +namespace HipTest +{ +double +elapsed_time(long long startTimeUs, long long stopTimeUs) +{ + return ((double) (stopTimeUs - startTimeUs)) / ((double) (1000)); +} + +int +parseSize(const char* str, size_t* output) +{ + char* next; + *output = strtoull(str, &next, 0); + int l = strlen(str); + if(l) + { + char c = str[l - 1]; // last char. + if((c == 'k') || (c == 'K')) + { + *output *= 1024; + } + if((c == 'm') || (c == 'M')) + { + *output *= (1024 * 1024); + } + if((c == 'g') || (c == 'G')) + { + *output *= (1024 * 1024 * 1024); + } + } + return 1; +} + +int +parseUInt(const char* str, unsigned int* output) +{ + char* next; + *output = strtoul(str, &next, 0); + return !strlen(next); +} + +int +parseInt(const char* str, int* output) +{ + char* next; + *output = strtol(str, &next, 0); + return !strlen(next); +} + +int +parseStandardArguments(int argc, char* argv[], bool failOnUndefinedArg) +{ + int extraArgs = 1; + for(int i = 1; i < argc; i++) + { + const char* arg = argv[i]; + + if(!strcmp(arg, " ")) + { + // skip NULL args. + } + else if(!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) + { + if(++i >= argc || !HipTest::parseSize(argv[i], &N)) + { + failed("Bad N size argument"); + } + } + else if(!strcmp(arg, "--threadsPerBlock")) + { + if(++i >= argc || !HipTest::parseUInt(argv[i], &threadsPerBlock)) + { + failed("Bad threadsPerBlock argument"); + } + } + else if(!strcmp(arg, "--blocksPerCU")) + { + if(++i >= argc || !HipTest::parseUInt(argv[i], &blocksPerCU)) + { + failed("Bad blocksPerCU argument"); + } + } + else if(!strcmp(arg, "--memsetval")) + { + int ex; + if(++i >= argc || !HipTest::parseInt(argv[i], &ex)) + { + failed("Bad memsetval argument"); + } + memsetval = ex; + } + else if(!strcmp(arg, "--memsetD32val")) + { + int ex; + if(++i >= argc || !HipTest::parseInt(argv[i], &ex)) + { + failed("Bad memsetD32val argument"); + } + memsetD32val = ex; + } + else if(!strcmp(arg, "--memsetD16val")) + { + int ex; + if(++i >= argc || !HipTest::parseInt(argv[i], &ex)) + { + failed("Bad memsetD16val argument"); + } + memsetD16val = ex; + } + else if(!strcmp(arg, "--memsetD8val")) + { + int ex; + if(++i >= argc || !HipTest::parseInt(argv[i], &ex)) + { + failed("Bad memsetD8val argument"); + } + memsetD8val = ex; + } + else if(!strcmp(arg, "--textureFilterMode")) + { + int mode; + if(++i >= argc || !HipTest::parseInt(argv[i], &mode)) + { + failed("Bad textureFilterMode argument"); + } + textureFilterMode = mode; + } + else if(!strcmp(arg, "--iterations") || (!strcmp(arg, "-i"))) + { + if(++i >= argc || !HipTest::parseInt(argv[i], &iterations)) + { + failed("Bad iterations argument"); + } + } + else if(!strcmp(arg, "--gpu") || (!strcmp(arg, "-gpuDevice")) || (!strcmp(arg, "-g"))) + { + if(++i >= argc || !HipTest::parseInt(argv[i], &p_gpuDevice)) + { + failed("Bad gpuDevice argument"); + } + } + else if(!strcmp(arg, "--verbose") || (!strcmp(arg, "-v"))) + { + if(++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose)) + { + failed("Bad verbose argument"); + } + } + else if(!strcmp(arg, "--tests") || (!strcmp(arg, "-t"))) + { + if(++i >= argc || !HipTest::parseInt(argv[i], &p_tests)) + { + failed("Bad tests argument"); + } + } + else if(!strcmp(arg, "--debug") || (!strcmp(arg, "-d"))) + { + if(++i >= argc || !HipTest::parseInt(argv[i], &debug_test)) + { + failed("Bad tests argument"); + } + } + else + { + if(failOnUndefinedArg) + { + failed("Bad argument '%s'", arg); + } + else + { + argv[extraArgs++] = argv[i]; + } + } + }; + + return extraArgs; +} + +unsigned +setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N) +{ + int device; + HIPCHECK(hipGetDevice(&device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + + unsigned blocks = props.multiProcessorCount * blocksPerCU; + if(blocks * threadsPerBlock > N) + { + blocks = (N + threadsPerBlock - 1) / threadsPerBlock; + } + + return blocks; +} + +} // namespace HipTest diff --git a/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.hpp b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.hpp new file mode 100644 index 0000000000..541629bcae --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/bin/mandelbrot/utils.hpp @@ -0,0 +1,733 @@ +/* +Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* + * File is intended to C and CPP compliant hence any CPP specic changes + * should be added into CPP section + * + */ +#pragma once + +#ifdef __cplusplus +# include +# include +# if __CUDACC__ +# include +# else +# include +# endif +#endif + +// ************************ GCC section ************************** +#include + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" + +#define HC __attribute__((hc)) + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +// HIP Skip Return code set at cmake +#define HIP_SKIP_RETURN_CODE 127 +#define HIP_ENABLE_SKIP_TESTS 0 + +// Recommended thresholds for Tests +#define MAX_THREADS 100 + +inline bool +hip_skip_tests_enabled() +{ + return HIP_ENABLE_SKIP_TESTS; +} + +inline int +hip_skip_retcode() +{ + // HIP Skip Return code set at cmake + return HIP_SKIP_RETURN_CODE; +} + +// This must be called in the end of main() to indicate test passed with success. +// If it's called somewhere else, compiling issues or unexpected result will arise. +#define passed() \ + printf("%sPASSED!%s\n", KGRN, KNRM); \ + return 0; + +// The real "assert" would have written to stderr. But it is +// sufficient to just fflush here without getting pedantic. This also +// ensures that we don't lose any earlier writes to stdout. +#define failed(...) \ + printf("%serror: ", KRED); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + printf("error: TEST FAILED\n%s", KNRM); \ + fflush(NULL); \ + abort(); + +#define warn(...) \ + printf("%swarn: ", KYEL); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + printf("warn: TEST WARNING\n%s", KNRM); + +#define HIP_PRINT_STATUS(status) \ + std::cout << hipGetErrorName(status) << " at line: " << __LINE__ << std::endl; + +#define HIPCHECK(error) \ + { \ + hipError_t localError = error; \ + if((localError != hipSuccess) && (localError != hipErrorPeerAccessAlreadyEnabled)) \ + { \ + printf("%serror: '%s'(%d) from %s at %s:%d%s\n", \ + KRED, \ + hipGetErrorString(localError), \ + localError, \ + #error, \ + __FILE__, \ + __LINE__, \ + KNRM); \ + failed("API returned error code."); \ + } \ + } + +#define HIPASSERT(condition) \ + if(!(condition)) \ + { \ + failed("%sassertion %s at %s:%d%s \n", KRED, #condition, __FILE__, __LINE__, KNRM); \ + } + +#define HIPCHECK_API(API_CALL, EXPECTED_ERROR) \ + { \ + hipError_t _e = (API_CALL); \ + if(_e != (EXPECTED_ERROR)) \ + { \ + failed("%sAPI '%s' returned %d(%s) but test expected %d(%s) at %s:%d%s \n", \ + KRED, \ + #API_CALL, \ + _e, \ + hipGetErrorName(_e), \ + EXPECTED_ERROR, \ + hipGetErrorName(EXPECTED_ERROR), \ + __FILE__, \ + __LINE__, \ + KNRM); \ + } \ + } + +#define HIPCHECK_RETURN_ONFAIL(func) \ + do \ + { \ + hipError_t herror = (func); \ + if(herror != hipSuccess) \ + { \ + return herror; \ + } \ + } while(0); + +#ifdef _WIN64 +# include +# define aligned_alloc(x, y) _aligned_malloc(y, x) +# define aligned_free(x) _aligned_free(x) +# define popen(x, y) _popen(x, y) +# define pclose(x) _pclose(x) +# define setenv(x, y, z) _putenv_s(x, y) +# define unsetenv _putenv +# define fileno(x) _fileno(x) +# define dup(x) _dup(x) +# define dup2(x, y) _dup2(x, y) +# define pipe(x, y, z) _pipe(x, y, z) +# define sleep(x) _sleep(x) +#else +# define aligned_free(x) free(x) +#endif + +// standard command-line variables: +extern size_t N; +extern char memsetval; +extern int memsetD32val; +extern short memsetD16val; +extern char memsetD8val; +extern int iterations; +extern unsigned blocksPerCU; +extern unsigned threadsPerBlock; +extern int textureFilterMode; +extern int p_gpuDevice; +extern unsigned p_verbose; +extern int p_tests; +extern int debug_test; +extern const char* HIP_VISIBLE_DEVICES_STR; +extern const char* CUDA_VISIBLE_DEVICES_STR; +extern const char* PATH_SEPERATOR_STR; +extern const char* NULL_DEVICE; + +// ********************* CPP section ********************* +#ifdef __cplusplus + +# ifdef __HIP_PLATFORM_HCC +# define TYPENAME(T) typeid(T).name() +# else +# define TYPENAME(T) "?" +# endif + +# ifdef _WIN64 +int +rand_r(unsigned int* s); +# endif + +// Get Optimal Thread count size +size_t +getHostThreadCount(const size_t memPerThread = 200 /* MB */, const size_t maxThreads = 0); + +namespace HipTest +{ +// Returns the current system time in microseconds +inline long long +get_time() +{ +# if __CUDACC__ + struct timeval tv; + gettimeofday(&tv, 0); + return (tv.tv_sec * 1000000) + tv.tv_usec; +# else + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::microseconds(1); +# endif +} + +double +elapsed_time(long long startTimeUs, long long stopTimeUs); + +int +parseSize(const char* str, size_t* output); +int +parseUInt(const char* str, unsigned int* output); +int +parseInt(const char* str, int* output); +int +parseStandardArguments(int argc, char* argv[], bool failOnUndefinedArg); + +unsigned +setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N); + +template // pointer type +void +checkArray(T hData, T hOutputData, size_t width, size_t height, size_t depth) +{ + for([[maybe_unused]] size_t i = 0; i < depth; i++) + { + for([[maybe_unused]] size_t j = 0; j < height; j++) + { + for([[maybe_unused]] size_t k = 0; k < width; k++) + { + int offset = i * width * height + j * width + k; + if(hData[offset] != hOutputData[offset]) + { + std::cerr << '[' << i << ',' << j << ',' << k << "]:" << hData[offset] << "----" + << hOutputData[offset] << " "; + failed("mistmatch at:%d %d %d", i, j, k); + } + } + } + } +} + +template +void +checkArray(T input, T output, size_t height, size_t width) +{ + for(size_t i = 0; i < height; i++) + { + for(size_t j = 0; j < width; j++) + { + int offset = i * width + j; + if(input[offset] != output[offset]) + { + std::cerr << '[' << i << ',' << j << ',' << "]:" << input[offset] << "----" + << output[offset] << " "; + failed("mistmatch at:%d %d", i, j); + } + } + } +} + +template +__global__ void +vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for(size_t i = offset; i < NELEM; i += stride) + { + C_d[i] = A_d[i] + B_d[i]; + } +} + +template +__global__ void +vectorADDReverse(const T* A_d, const T* B_d, T* C_d, size_t NELEM) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for(int64_t i = NELEM - stride + offset; i >= 0; i -= stride) + { + C_d[i] = A_d[i] + B_d[i]; + } +} + +template +__global__ void +addCount(const T* A_d, T* C_d, size_t NELEM, int count) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + // Deliberately do this in an inefficient way to increase kernel runtime + for(int i = 0; i < count; i++) + { + for(size_t i = offset; i < NELEM; i += stride) + { + C_d[i] = A_d[i] + (T) count; + } + } +} + +template +__global__ void +addCountReverse(const T* A_d, T* C_d, int64_t NELEM, int count) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + // Deliberately do this in an inefficient way to increase kernel runtime + for(int i = 0; i < count; i++) + { + for(int64_t i = NELEM - stride + offset; i >= 0; i -= stride) + { + C_d[i] = A_d[i] + (T) count; + } + } +} + +template +__global__ void +memsetReverse(T* C_d, T val, int64_t NELEM) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for(int64_t i = NELEM - stride + offset; i >= 0; i -= stride) + { + C_d[i] = val; + } +} + +template +void +setDefaultData(size_t numElements, T* A_h, T* B_h, T* C_h) +{ + // Initialize the host data: + for(size_t i = 0; i < numElements; i++) + { + if(A_h) (A_h)[i] = 3.146f + i; // Pi + if(B_h) (B_h)[i] = 1.618f + i; // Phi + if(C_h) (C_h)[i] = 0.0f + i; + } +} + +template +void +initArraysForHost(T** A_h, T** B_h, T** C_h, size_t N, bool usePinnedHost = false) +{ + size_t Nbytes = N * sizeof(T); + + if(usePinnedHost) + { + if(A_h) + { + HIPCHECK(hipHostMalloc(reinterpret_cast(A_h), Nbytes)); + } + if(B_h) + { + HIPCHECK(hipHostMalloc(reinterpret_cast(B_h), Nbytes)); + } + if(C_h) + { + HIPCHECK(hipHostMalloc(reinterpret_cast(C_h), Nbytes)); + } + } + else + { + if(A_h) + { + *A_h = (T*) malloc(Nbytes); + HIPASSERT(*A_h != NULL); + } + + if(B_h) + { + *B_h = (T*) malloc(Nbytes); + HIPASSERT(*B_h != NULL); + } + + if(C_h) + { + *C_h = (T*) malloc(Nbytes); + HIPASSERT(*C_h != NULL); + } + } + + setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL); +} + +template +void +initArrays(T** A_d, + T** B_d, + T** C_d, + T** A_h, + T** B_h, + T** C_h, + size_t N, + bool usePinnedHost = false) +{ + size_t Nbytes = N * sizeof(T); + + if(A_d) + { + HIPCHECK(hipMalloc(A_d, Nbytes)); + } + if(B_d) + { + HIPCHECK(hipMalloc(B_d, Nbytes)); + } + if(C_d) + { + HIPCHECK(hipMalloc(C_d, Nbytes)); + } + + initArraysForHost(A_h, B_h, C_h, N, usePinnedHost); +} + +template +void +freeArraysForHost(T* A_h, T* B_h, T* C_h, bool usePinnedHost) +{ + if(usePinnedHost) + { + if(A_h) + { + HIPCHECK(hipHostFree(A_h)); + } + if(B_h) + { + HIPCHECK(hipHostFree(B_h)); + } + if(C_h) + { + HIPCHECK(hipHostFree(C_h)); + } + } + else + { + if(A_h) + { + free(A_h); + } + if(B_h) + { + free(B_h); + } + if(C_h) + { + free(C_h); + } + } +} + +template +void +freeArrays(T* A_d, T* B_d, T* C_d, T* A_h, T* B_h, T* C_h, bool usePinnedHost) +{ + if(A_d) + { + HIPCHECK(hipFree(A_d)); + } + if(B_d) + { + HIPCHECK(hipFree(B_d)); + } + if(C_d) + { + HIPCHECK(hipFree(C_d)); + } + + freeArraysForHost(A_h, B_h, C_h, usePinnedHost); +} + +# if defined(__HIP_PLATFORM_AMD__) +template +void +initArrays2DPitch(T** A_d, + T** B_d, + T** C_d, + size_t* pitch_A, + size_t* pitch_B, + size_t* pitch_C, + size_t numW, + size_t numH) +{ + if(A_d) + { + HIPCHECK(hipMallocPitch((void**) A_d, pitch_A, numW * sizeof(T), numH)); + } + if(B_d) + { + HIPCHECK(hipMallocPitch((void**) B_d, pitch_B, numW * sizeof(T), numH)); + } + if(C_d) + { + HIPCHECK(hipMallocPitch((void**) C_d, pitch_C, numW * sizeof(T), numH)); + } + + HIPASSERT(*pitch_A == *pitch_B); + HIPASSERT(*pitch_A == *pitch_C) +} + +inline void +initHIPArrays(hipArray** A_d, + hipArray** B_d, + hipArray** C_d, + const hipChannelFormatDesc* desc, + const size_t numW, + const size_t numH, + const unsigned int flags) +{ + if(A_d) + { + HIPCHECK(hipMallocArray(A_d, desc, numW, numH, flags)); + } + if(B_d) + { + HIPCHECK(hipMallocArray(B_d, desc, numW, numH, flags)); + } + if(C_d) + { + HIPCHECK(hipMallocArray(C_d, desc, numW, numH, flags)); + } +} +# endif + +// Assumes C_h contains vector add of A_h + B_h +// Calls the test "failed" macro if a mismatch is detected. +template +size_t +checkVectorADD(T* A_h, + T* B_h, + T* result_H, + size_t N, + bool expectMatch = true, + bool reportMismatch = true) +{ + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for(size_t i = 0; i < N; i++) + { + T expected = A_h[i] + B_h[i]; + if(result_H[i] != expected) + { + if(mismatchCount == 0) + { + firstMismatch = i; + } + mismatchCount++; + if((mismatchCount <= mismatchesToPrint) && expectMatch) + { + std::cout << std::fixed << std::setprecision(32); + std::cout << "At " << i << std::endl; + std::cout << " Computed:" << result_H[i] << std::endl; + std::cout << " Expected:" << expected << std::endl; + } + } + } + + if(reportMismatch) + { + if(expectMatch) + { + if(mismatchCount) + { + failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + } + } + else + { + if(mismatchCount == 0) + { + failed("expected mismatches but did not detect any!"); + } + } + } + + return mismatchCount; +} + +// Assumes C_h contains vector add of A_h + B_h +// Calls the test "failed" macro if a mismatch is detected. +template +void +checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch = true) +{ + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for(size_t i = 0; i < N; i++) + { + if(result_H[i] != expected_H[i]) + { + if(mismatchCount == 0) + { + firstMismatch = i; + } + mismatchCount++; + if((mismatchCount <= mismatchesToPrint) && expectMatch) + { + std::cout << std::fixed << std::setprecision(32); + std::cout << "At " << i << std::endl; + std::cout << " Computed:" << result_H[i] << std::endl; + std::cout << " Expected:" << expected_H[i] << std::endl; + } + } + } + + if(expectMatch) + { + if(mismatchCount) + { + fprintf(stderr, "%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + // failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + } + } + else + { + if(mismatchCount == 0) + { + failed("expected mismatches but did not detect any!"); + } + } +} + +//--- +struct Pinned +{ + static const bool isPinned = true; + static const char* str() { return "Pinned"; }; + + static void* Alloc(size_t sizeBytes) + { + void* p; + HIPCHECK(hipHostMalloc((void**) &p, sizeBytes)); + return p; + }; +}; + +//--- +struct Unpinned +{ + static const bool isPinned = false; + static const char* str() { return "Unpinned"; }; + + static void* Alloc(size_t sizeBytes) + { + void* p = malloc(sizeBytes); + HIPASSERT(p); + return p; + }; +}; + +struct Memcpy +{ + static const char* str() { return "Memcpy"; }; +}; + +struct MemcpyAsync +{ + static const char* str() { return "MemcpyAsync"; }; +}; + +template +struct MemTraits; + +template <> +struct MemTraits +{ + static void Copy(void* dest, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t) + { + HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind)); + } +}; + +template <> +struct MemTraits +{ + static void Copy(void* dest, + const void* src, + size_t sizeBytes, + hipMemcpyKind kind, + hipStream_t stream) + { + HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream)); + } +}; + +inline bool +isImageSupported() +{ + int imageSupport = 1; +# ifdef __HIP_PLATFORM_AMD__ + HIPCHECK(hipDeviceGetAttribute(&imageSupport, hipDeviceAttributeImageSupport, p_gpuDevice)); +# endif + return imageSupport != 0; +} + +}; // namespace HipTest + +// This must be called in the beginning of image test app's main() to indicate whether image +// is supported. +# define checkImageSupport() \ + if(!HipTest::isImageSupported()) \ + { \ + printf("Texture is not support on the device. Skipped.\n"); \ + passed(); \ + } +#endif //__cplusplus + +// Function to determine if the device is of gfx11 architecture +bool +IsGfx11(); diff --git a/projects/rocprofiler-sdk/benchmark/source/bin/rocprofv3-benchmark.py b/projects/rocprofiler-sdk/benchmark/source/bin/rocprofv3-benchmark.py new file mode 100755 index 0000000000..307a5be015 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/bin/rocprofv3-benchmark.py @@ -0,0 +1,1487 @@ +#!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + +import os +import re +import sys +import json +import math +import yaml +import random +import shutil +import hashlib +import argparse +import datetime +import textwrap +import subprocess + +# global variable that should never be modified during runtime +CONST_METRIC_LIST = [ + "wall_time", + "cpu_time", + "cpu_util", + "peak_rss", + "page_rss", + "virtual_memory", + "major_page_faults", + "minor_page_faults", + "priority_context_switches", + "voluntary_context_switches", +] + +CONST_ROCPROFV3_PROFILE_LIST = [ + "hip_compiler_api", + "hip_runtime_api", + "hsa_api", + "kernel_dispatch", + "marker_api", + "memory_allocation", + "memory_copy", + "ompt", + "rccl_api", + "rocdecode_api", + "rocjpeg_api", + "scratch_memory", +] + +# global variable that should never be modified during runtime. +# Key is database metric name, mapped value is timem json label +CONST_TIMEM_METRIC_MAP = { + "wall_time": "wall_clock", + "cpu_time": "cpu_clock", + "cpu_util": "cpu_util", + "peak_rss": "peak_rss", + "page_rss": "page_rss", + "virtual_memory": "virtual_memory", + "major_page_faults": "num_major_page_faults", + "minor_page_faults": "num_minor_page_faults", + "priority_context_switches": "priority_context_switch", + "voluntary_context_switches": "voluntary_context_switch", +} + +# global variable that should never be modified during runtime. +# Key is database metric name, mapped value is units +CONST_METRIC_UNIT_MAP = { + "wall_time": "sec", + "cpu_time": "sec", + "cpu_util": "%", + "peak_rss": "MB", + "page_rss": "MB", + "virtual_memory": "MB", + "major_page_faults": "", + "minor_page_faults": "", + "priority_context_switches": "", + "voluntary_context_switches": "", +} + +CONST_VERSION_INFO = { + "version": "@FULL_VERSION_STRING@", + "git_revision": "@ROCPROFILER_SDK_GIT_REVISION@", + "library_arch": "@CMAKE_LIBRARY_ARCHITECTURE@", + "system_name": "@CMAKE_SYSTEM_NAME@", + "system_processor": "@CMAKE_SYSTEM_PROCESSOR@", + "system_version": "@CMAKE_SYSTEM_VERSION@", + "compiler_id": "@CMAKE_CXX_COMPILER_ID@", + "compiler_version": "@CMAKE_CXX_COMPILER_VERSION@", +} + +# global variable(s) that should never be modified during runtime. +CONST_THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +CONST_LOCAL_TIMEM_PATH = os.path.join(CONST_THIS_DIR, "timem") + + +def get_linked_libraries(executable_path): + """ + Run `ldd` on the given executable and return a list of linked library paths. + + :param executable_path: Path to the ELF executable. + :return: List of Path objects to the linked libraries. + """ + try: + result = subprocess.run( + ["ldd", executable_path], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + except subprocess.CalledProcessError as e: + sys.stderr.write(f"ldd failed: {e}\n") + sys.stderr.flush() + return [] + + libraries = [] + for line in result.stdout.splitlines(): + # Match lines like: "\tlibc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f8a3b2e5000)" + match = re.search(r"=>\s+(\S+)", line) + if match: + lib_path = match.group(1) + if lib_path != "not": + libraries.append(os.path.realpath(lib_path)) + else: + # Match lines like: "/lib64/ld-linux-x86-64.so.2 (0x00007f8a3b5de000)" + match_direct = re.match(r"^\s*(/[^ ]+)", line) + if match_direct: + libraries.append(os.path.realpath(match_direct.group(1))) + + return libraries + + +def get_needed_libraries(executable_path): + """ + Extract DT_RPATH, DT_RUNPATH, and DT_NEEDED entries (shared library names) from an ELF file. + + :param executable_path: Path to the ELF executable. + :return: List of RPATHs, list of RUNPATHs, and list of shared library names (e.g. 'libc.so.6'). + """ + from elftools.elf.elffile import ELFFile + from elftools.elf.dynamic import DynamicSection + + rpath = [] + runpath = [] + needed = [] + with open(executable_path, "rb") as f: + elf = ELFFile(f) + for section in elf.iter_sections(): + if isinstance(section, DynamicSection): + for tag in section.iter_tags(): + if tag.entry.d_tag == "DT_NEEDED": + needed.append(tag.needed) + elif tag.entry.d_tag == "DT_RUNPATH": + runpath += tag.runpath.split(":") + elif tag.entry.d_tag == "DT_RPATH": + rpath += tag.rpath.split(":") + + return (rpath, runpath, needed) + + +def resolve_library(target, name, rpath, ld_library_path, runpath): + """ + Attempt to resolve a shared library name to its full path RPATH, LD_LIBRARY_PATH, RUNPATH, and lastly, `ldconfig -p`. + + :param target: Path to target executable (source of DT_NEEDED) + :param name: Shared library name (e.g., 'libc.so.6') + :param rpath: List of RPATH directories + :param ld_library_path: List of LD_LIBRARY_PATH directories + :param runpath: List of RUNPATH directories + :return: Path to library or None if not found + """ + + def _verify_resolved_libpath(_libpath): + _libpath = _libpath.replace("$ORIGIN", os.path.realpath(os.path.dirname(target))) + _val = os.path.join(_libpath, name) + if os.path.exists(_val) and os.path.isfile(_val): + return os.path.realpath(_val) + return None + + for path_set in [rpath, ld_library_path, runpath]: + for libpath in path_set: + lib = _verify_resolved_libpath(libpath) + if lib is not None: + return lib + + result = subprocess.run(["ldconfig", "-p"], stdout=subprocess.PIPE, text=True) + for line in result.stdout.splitlines(): + if name in line: + parts = [itr.strip() for itr in line.strip().split(" ")] + if ( + len(parts) >= 2 + and os.path.exists(parts[-1]) + and os.path.basename(parts[-1]) == name + ): + return os.path.realpath(parts[-1]) + + return None + + +def get_executable_depends(target, ld_library_path): + deps = [] + + try: + deps += get_linked_libraries(target) + except Exception as e: + sys.stderr.write(f"'{target}' raised an exception: {e}\n") + finally: + sys.stderr.flush() + + if len(deps) == 0: + try: + from elftools.common.exceptions import ELFError + + # if ldd failed, try using pyelftools + rpath, runpath, needed = get_needed_libraries(target) + for nitr in needed: + resolved_nitr = resolve_library( + target, nitr, rpath, ld_library_path, runpath + ) + if resolved_nitr is not None: + deps += [resolved_nitr] + except ELFError: + sys.stderr.write( + f"'{target}' is not an ELF file. md5sum changes will be unreliable\n" + ) + except Exception as e: + sys.stderr.write( + f"'{target}' raised an exception (ignoring): {e}. md5sum changes may be unreliable\n" + ) + finally: + sys.stderr.flush() + + return deps + + +def compute_hash(data): + _data = f"{data}" if not isinstance(data, str) else data + return hashlib.md5(_data.encode()).hexdigest() + + +def log_message(*args): + + sys.stdout.write("\n####\n") + sys.stdout.write("#### ") + sys.stdout.write(*args) + sys.stdout.write("\n####\n") + sys.stdout.flush() + + +def patch_message(msg, *args): + msg = textwrap.dedent(msg) + + if len(args) > 0: + msg = msg.format(*args) + + return msg.strip("\n").strip() + + +def fatal_error(msg, *args, exit_code=1): + msg = patch_message(msg, *args) + sys.stderr.write(f"Fatal error: {msg}\n") + sys.stderr.flush() + sys.exit(exit_code) + + +# Standard deviation function (sample stddev, i.e., n-1 in denominator) +def stddev_samp(values): + n = len(values) + if n < 2: + return None + mean = sum(values) / n + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + return math.sqrt(variance) + + +# Aggregate class for use with sqlite3 +class StdDevSamp: + def __init__(self): + self.values = [] + + def step(self, value): + if value is not None: + self.values.append(value) + + def finalize(self): + return stddev_samp(self.values) + + +# Function to connect to the MySQL database +def connect_to_database(args): + + if args.db_backend == "sqlite3": + import sqlite3 + + connection = sqlite3.connect(args.sqlite3_database) + connection.create_aggregate("STDDEV_SAMP", 1, StdDevSamp) + + elif args.db_backend == "mysql": + import mysql.connector + + mysql_config = { + "user": args.mysql_user, + "password": args.mysql_passwd, + "host": args.mysql_host, + "database": args.mysql_database, + } + connection = mysql.connector.connect(**mysql_config) + else: + raise ValueError(f"unhandled database backend {args.db_backend}") + + # backend agnostic + cursor = connection.cursor() + + # initial setup + if args.db_backend == "sqlite3": + cursor.execute("PRAGMA foreign_keys = ON") + elif args.db_backend == "mysql": + cursor.execute(f"CREATE DATABASE IF NOT EXISTS {args.mysql_database}") + cursor.execute(f"USE {args.mysql_database}") + else: + raise ValueError(f"unhandled database backend {args.db_backend}") + + def execute_db_script(script_file, backend, repl): + with open(script_file, "r") as ifs: + data = ifs.read() + for rkey, ritr in repl.items(): + data = data.replace(rkey, ritr) + + if backend == "sqlite3": + cursor.executescript(data) + elif backend == "mysql": + statements = data.split(";") + for statement in statements: + statement = statement.strip() + if statement: + cursor.execute(statement) + else: + raise ValueError(f"unhandled database backend {backend}") + + # SQL implementation agnostic schema + for schema in args.db_schema: + repl = {} + if args.db_backend == "sqlite3": + repl = { + " INT ": " INTEGER ", + "AUTO_INCREMENT": "AUTOINCREMENT", + '("{}")': '"{}"', + '("[]")': '"[]"', + } + execute_db_script(schema, args.db_backend, repl) + + # SQL views substitute {{metric}} syntax with metric list + for views in args.db_views: + for metric in CONST_METRIC_LIST: + repl = {"{{metric}}": f"{metric}"} + execute_db_script(views, args.db_backend, repl) + + return connection, cursor + + +def find_benchmarked_app(cursor, cmd, launcher, environment): + """ + Insert benchmarked application + Args: + cursor: + Database cursor + cmd: + Command line executed + launcher: list + Command line launcher + environment: dict + Environment variables + Returns: + id: row id of matching benchmarked application or None + """ + + cmd_json = json.dumps(cmd) + ld_library_path = environment.get("LD_LIBRARY_PATH", "").split(":") + + # get the executable file + exe_file = cmd[0] + if not os.path.exists(exe_file): + exe_file = shutil.which(exe_file) + + exe_data = [] + with open(exe_file, "rb") as ifs: + exe_data += [ifs.read()] + + # factor in the md5sum of the linked libraries + for dep in get_executable_depends(exe_file, ld_library_path): + with open(dep, "rb") as ifs: + exe_data += [ifs.read()] + + md5sum = compute_hash("".join([f"{itr}" for itr in exe_data])) + + hash_id = compute_hash( + f"{launcher}{cmd_json}{environment}".replace("\n", "").replace(" ", "") + ) + + select_stmt = f"SELECT id FROM benchmarked_app WHERE hash_id = '{hash_id}' AND md5sum = '{md5sum}'" + + cursor.execute(select_stmt) + existing_id = cursor.fetchone() + if existing_id is not None: + return existing_id[0] + + return None + + +def insert_benchmarked_app(cursor, cmd, launcher, environment, app_id, profile, args): + """ + Insert benchmarked application + Args: + cursor: + Database cursor + cmd: list + Command line executed + launcher: list + Command line launcher + environment: dict + Environment variables + app_id: integer + Return value of find_benchmarked_app + profile: dict + Return value of execute_profile + args: + Command line args (argparse.Namespace) + Returns: + id: row id of the insert + """ + + cmd_json = json.dumps(cmd) + env_json = json.dumps(environment) + ld_library_path = environment.get("LD_LIBRARY_PATH", "").split(":") + + # get the executable file + exe_file = cmd[0] + if not os.path.exists(exe_file): + exe_file = shutil.which(exe_file) + + exe_data = [] + with open(exe_file, "rb") as ifs: + exe_data += [ifs.read()] + + # factor in the md5sum of the linked libraries + for dep in get_executable_depends(exe_file, ld_library_path): + with open(dep, "rb") as ifs: + exe_data += [ifs.read()] + + md5sum = compute_hash("".join([f"{itr}" for itr in exe_data])) + + hash_id = compute_hash( + f"{launcher}{cmd_json}{environment}".replace("\n", "").replace(" ", "") + ) + + select_stmt = f"SELECT id FROM benchmarked_app WHERE hash_id = '{hash_id}' AND md5sum = '{md5sum}'" + + cursor.execute(select_stmt) + existing_id = cursor.fetchone() + if existing_id is not None: + assert ( + app_id is not None + ), "insert_benchmarked_app should have been passed a non-null app_id from find_benchmarked_app" + if app_id != existing_id[0]: + raise RuntimeError( + f"find_benchmarked_app found app_id={app_id} but insert_benchmarked_app found app_id={existing_id[0]}" + ) + return existing_id[0] + + if profile is None: + raise RuntimeError( + f"insert_benchmarked_app requires an application profile from rocprofv3. cmd: {cmd}" + ) + + values = ( + [ + "hash_id", + "md5sum", + "command", + "threads", + ] + + CONST_ROCPROFV3_PROFILE_LIST + + [ + "environment", + ] + ) + + # make sure the dict indexing in cursor.execute succeeds + for col in CONST_ROCPROFV3_PROFILE_LIST: + if col not in profile: + profile[col] = {"count": None, "unique": None} + + # combine HSA and Marker APIs into "hsa_api" and "marker_api" entries + for key, itr in profile.items(): + for api in ["hsa", "marker"]: + if re.match(f"^{api}_([a-z_]+)_api$", key): + for col in ["count", "unique"]: + if profile[f"{api}_api"][col] is None: + profile[f"{api}_api"][col] = 0 + profile[f"{api}_api"][col] += itr[col] + + insert_stmt = "INSERT INTO benchmarked_app ({}) VALUES ({})".format( + ", ".join(values), ", ".join([args.db_placeholder for _ in values]) + ) + + cursor.execute( + insert_stmt, + [ + hash_id, + md5sum, + cmd_json, + profile["threads"], + ] + + [profile[itr]["count"] for itr in CONST_ROCPROFV3_PROFILE_LIST] + + [env_json], + ) + + return cursor.lastrowid + + +def insert_benchmark_config(cursor, sdk_id, config_record, args): + """ + Insert rocprofiler-sdk information + Args: + cursor: + Database cursor + config_record: + configuration info dict from rocprofv3 config json. + args: + Command line args (argparse.Namespace) + Returns: + id: row id of the insert + """ + + if config_record is not None: + if "rocprofiler-sdk-tool" in config_record: + config_record = config_record["rocprofiler-sdk-tool"][0] + + if "metadata" in config_record: + config_record = config_record["metadata"] + + config_record = config_record["config"] + else: + config_record = {"benchmark_mode": "baseline"} + + info_mapping = { + "benchmark_mode": [], + "kernel_rename": [], + "group_by_queue": [], + "kernel_trace": [], + "hsa_trace": [ + "hsa_core_api_trace", + "hsa_amd_ext_api_trace", + "hsa_image_ext_api_trace", + "hsa_finalizer_ext_api_trace", + ], + "hip_runtime_trace": ["hip_runtime_api_trace"], + "hip_compiler_trace": ["hip_compiler_api_trace"], + "marker_trace": ["marker_api_trace"], + "memory_copy_trace": [], + "memory_allocation_trace": [], + "scratch_memory_trace": [], + "rccl_trace": ["rccl_api_trace"], + "rocdecode_trace": ["rocdecode_api_trace"], + "rocjpeg_trace": ["rocjpeg_api_trace"], + "dispatch_counter_collection": ["counter_collection"], + "pc_sampling_host_trap": [], + "pc_sampling_stocastic": [], + "advanced_thread_trace": [], + "pmc_counters": ["counters"], + } + + name_mapping = { + "hsa_trace": "HSA", + "hip_runtime_trace": "HIP (Runtime)", + "hip_compiler_trace": "HIP (Compiler)", + "marker_trace": "ROCTx", + "group_by_queue": "Group-By-Queue", + "memory_allocation_trace": "Memory Alloc", + "rccl_trace": "RCCL", + "rocdecode_trace": "rocDecode", + "rocjpeg_trace": "rocJPEG", + "pc_sampling_host_trap": "PC Sampling (Host Trap)", + "pc_sampling_stocastic": "PC Sampling (Stocastic)", + "advanced_thread_trace": "ATT", + "pmc_counters": "PMC", + } + + def patch_key(val): + if val in name_mapping: + return name_mapping[val] + return " ".join( + [itr.capitalize() for itr in val.lower().replace("_trace", "").split("_")] + ) + + data = {} + labels = [] + for col, itr in info_mapping.items(): + if not itr: + itr = [col] + + for citr in itr: + if citr in config_record and config_record[citr]: + val = config_record[citr] + if isinstance(val, list): + data[col] = json.dumps(*val) + labels.append(f"{patch_key(col)}={val}") + elif isinstance(val, dict): + data[col] = json.dumps(val) + labels.append(f"{patch_key(col)}={val}") + elif isinstance(val, bool): + data[col] = config_record[citr] + labels.append(patch_key(col)) + elif isinstance(val, str) and col == "benchmark_mode": + data[col] = config_record[citr] + else: + raise ValueError( + f"unexpected data type for column '{col}': {type(config_record[citr]).__name__}" + ) + break + + data_json = json.dumps(data) + + label = " + ".join(sorted(labels)) + + hash_id = compute_hash(f"{data_json}".replace("\n", "").replace(" ", "")) + + select_stmt = f"SELECT id FROM benchmark_config WHERE hash_id = '{hash_id}'" + + cursor.execute(select_stmt) + existing_id = cursor.fetchone() + if existing_id is not None: + return existing_id[0] + + columns = ["hash_id", "sdk_id", "label"] + list(data.keys()) + insert_stmt = "INSERT INTO benchmark_config ({}) VALUES ({})".format( + ", ".join(columns), ", ".join([args.db_placeholder for _ in columns]) + ) + + values = [hash_id, sdk_id, label] + [itr for _, itr in data.items()] + cursor.execute(insert_stmt, values) + + return cursor.lastrowid + + +def insert_benchmarked_sdk(cursor, config_record, args): + """ + Insert rocprofiler-sdk information + Args: + cursor: + Database cursor + config_record: + configuration info dict from rocprofv3 config json. + args: + Command line args (argparse.Namespace) + Returns: + id: row id of the insert + """ + + if "rocprofiler-sdk-tool" in config_record: + config_record = config_record["rocprofiler-sdk-tool"][0] + + if "metadata" in config_record: + config_record = config_record["metadata"] + + build_spec = config_record["build_spec"] + + hash_id = compute_hash(f"{build_spec}".replace("\n", "").replace(" ", "")) + + select_stmt = f"SELECT id FROM benchmarked_sdk WHERE hash_id = '{hash_id}'" + + cursor.execute(select_stmt) + existing_id = cursor.fetchone() + if existing_id is not None: + return existing_id[0] + + spec_columns = [ + "version_major", + "version_minor", + "version_patch", + "soversion", + "compiler_id", + "compiler_version", + "git_revision", + "library_arch", + "system_name", + "system_processor", + "system_version", + ] + + columns = ["hash_id"] + spec_columns + insert_stmt = "INSERT INTO benchmarked_sdk ({}) VALUES ({})".format( + ", ".join(columns), ", ".join([args.db_placeholder for _ in columns]) + ) + + values = [build_spec[itr] for itr in spec_columns] + cursor.execute(insert_stmt, (hash_id, *values)) + + return cursor.lastrowid + + +def insert_performance_metrics( + cursor, app_id, sdk_id, cfg_id, executed_at, perf_record, args +): + """ + Insert performance metric record + Args: + cursor: + Database cursor + app_id: + application database row index + sdk_id: + rocprofiler-sdk database row index + cfg_id: + configuration info dict from rocprofv3 config json. + This value should be None if it is a baseline measurement + executed_at: + UTC date and time, e.g. 2025-05-13 18:25:37.554604+00:00 + perf_record: + performance data dict from timem json + args: + Command line args (argparse.Namespace) + Returns: + id: row id of the insert + """ + + # Insert query matching the schema + values = [ + "app_id", + "cfg_id", + "sdk_id", + "executed_at", + ] + CONST_METRIC_LIST + + insert_stmt = "INSERT INTO benchmark_metrics ({}) VALUES ({})".format( + ", ".join(values), ", ".join([args.db_placeholder for _ in values]) + ) + + _record = perf_record["timemory"]["timem"][0] + + # timem records are _record[]["value"] so use the CONST_TIMEM_METRIC_MAP dict to map + # the database metric name to the dict entry in the timem record, e.g. wall_time is + # located in _record["wall_clock"]["value"] + values = [app_id, cfg_id, sdk_id, executed_at] + [ + _record[CONST_TIMEM_METRIC_MAP[itr]]["value"] for itr in CONST_METRIC_LIST + ] + + cursor.execute(insert_stmt, values) + + return cursor.lastrowid + + +def execute_profile( + cmd, + args, + env=dict(os.environ), + launcher=[], +): + + _argt = os.path.basename(cmd[0]) + _pid = os.getpid() + _data_dir = args.data_dir + _cmd = cmd + + _config_file = None + _rocprof_out = f"profile-{_pid}-{_argt}" + _config_file = f"{_data_dir}/{_rocprof_out}_config.json" + _cmd = ( + [ + args.rocprofv3, + "--output-config", + "-d", + _data_dir, + "-o", + _rocprof_out, + "--benchmark-mode", + "execution-profile", + "--sys-trace", + ] + + ["--"] + + _cmd + ) + + log_message("Executing profile run '{}'".format(" ".join(_cmd))) + + _cmd = launcher + _cmd + + exit_code = subprocess.check_call(_cmd, env=env) + + if exit_code != 0: + fatal_error("Application exited with non-zero exit code", exit_code) + + _profile = None + with open(_config_file, "r") as ifs: + _cfg = json.load(ifs) + _profile = _cfg["rocprofiler-sdk-tool"][0]["metadata"]["profile"] + + if not args.keep_data: + for fname in [_config_file]: + if fname is not None and os.path.exists(fname): + try: + os.remove(fname) + except Exception as e: + sys.stderr.write(f"Error removing {fname}: {e}\n") + sys.stderr.flush() + + return _profile + + +def execute_run( + cursor, + benchmark_mode, + rocprofv3_args, + cmd, + app_id, + nitr, + args, + env=dict(os.environ), + launcher=[], + is_warmup=False, +): + + _argt = os.path.basename(cmd[0]) + _pid = os.getpid() + _data_dir = args.data_dir + _cmd = cmd + + _timem_file = f"{_data_dir}/perf-{_pid}-{_argt}-{nitr}" + _config_file = None + + if rocprofv3_args: + _rocprof_out = f"out-{_pid}-{_argt}-{nitr}" + _config_file = f"{_data_dir}/{_rocprof_out}_config.json" + _cmd = ( + [ + args.rocprofv3, + "--output-config", + "-d", + _data_dir, + "-o", + _rocprof_out, + "--benchmark-mode", + benchmark_mode, + ] + + rocprofv3_args + + ["--"] + + _cmd + ) + + if is_warmup: + log_message("Executing warmup iteration '{}'".format(" ".join(launcher + _cmd))) + else: + log_message("Executing '{}'".format(" ".join(_cmd))) + + _cmd = launcher + [args.timem, "-o", _timem_file, "--"] + _cmd + _now = datetime.datetime.now(datetime.timezone.utc) + + exit_code = subprocess.check_call(_cmd, env=env) + + if exit_code != 0: + fatal_error("Application exited with non-zero exit code", exit_code) + + if not is_warmup: + with open(f"{_timem_file}.json", "r") as ifs: + _perf = json.load(ifs) + + if _config_file is not None: + with open(_config_file, "r") as ifs: + _cfg = json.load(ifs) + _sdk_id = insert_benchmarked_sdk(cursor, _cfg, args) + _cfg_id = insert_benchmark_config(cursor, _sdk_id, _cfg, args) + else: + _sdk_id = None + _cfg_id = insert_benchmark_config(cursor, _sdk_id, None, args) + + _perf_id = insert_performance_metrics( + cursor, app_id, _sdk_id, _cfg_id, _now, _perf, args + ) + + log_message( + "Inserted benchmarked sdk (app id {}, cfg id {}, sdk id {}, metrics id {})".format( + app_id, _cfg_id, _sdk_id, _perf_id + ) + ) + else: + _sdk_id = None + _perf_id = None + + if not args.keep_data: + for fname in [f"{_timem_file}.json", _config_file]: + if fname is not None and os.path.exists(fname): + try: + os.remove(fname) + except Exception as e: + sys.stderr.write(f"Error removing {fname}: {e}\n") + sys.stderr.flush() + + return (_sdk_id, _perf_id) + + +def parse_args(argv=None): + """ + Parse the command line arguments + Args: + argv: + Command line arguments + Returns: + argparse.Namespace + """ + + if argv is None: + argv = sys.argv[1:] + + # default path to timem if user doesn't provide it + default_timem_path = ( + CONST_LOCAL_TIMEM_PATH if os.path.exists(CONST_LOCAL_TIMEM_PATH) else "timem" + ) + + # Set up command-line argument parsing + parser = argparse.ArgumentParser( + description="Execute one or more applications", + allow_abbrev=False, + ) + + parser.add_argument( + "--version", + action="store_true", + help="Print the version information and exit", + ) + + parser.add_argument( + "-i", + "--input", + type=str, + help="Path to YAML input file", + ) + + parser.add_argument( + "-n", + "--num-iterations", + type=int, + help="Number of iterations to run", + default=3, + ) + + parser.add_argument( + "-w", + "--num-warmup-iterations", + type=int, + help="Number of warmup iterations to run", + default=1, + ) + + parser.add_argument( + "--data-dir", + type=str, + help="Directory of the timem and rocprofv3 output", + default="/tmp/rocprofv3-benchmark", + metavar="PATH", + ) + + parser.add_argument( + "--keep-data", + action="store_true", + help="Do not delete the timem and rocprofv3 output", + ) + + exe_options = parser.add_argument_group("Executable options") + + exe_options.add_argument( + "--timem", + type=str, + help="Path to timem executable", + default=default_timem_path, + metavar="FILEPATH", + ) + + exe_options.add_argument( + "--rocprofv3", + type=str, + help="Path to rocprofv3 executable", + default="rocprofv3", + metavar="FILEPATH", + ) + + database_options = parser.add_argument_group("Database options (generic)") + + database_options.add_argument( + "--db-backend", + type=str, + help="Select the database backend", + choices=("sqlite3", "mysql"), + default="sqlite3", + ) + + database_options.add_argument( + "--db-schema", + type=str, + help="Database schema file(s)", + nargs="+", + default=[ + os.path.join( + CONST_THIS_DIR, "..", "share", "rocprofiler-sdk", "benchmark_tables.sql" + ) + ], + metavar="FILEPATH", + ) + + database_options.add_argument( + "--db-views", + type=str, + help="Database views file(s)", + nargs="+", + default=[ + os.path.join( + CONST_THIS_DIR, "..", "share", "rocprofiler-sdk", "benchmark_views.sql" + ) + ], + metavar="FILEPATH", + ) + + job_filter_options = parser.add_argument_group("Job filter options") + + job_filter_options.add_argument( + "--name", + "--filter-name", + type=str, + nargs="+", + help="Run a specific job name", + default=None, + metavar="REGEX", + dest="filter_name", + ) + + job_filter_options.add_argument( + "--group", + "--filter-group", + type=str, + nargs="+", + help="Run jobs in the designated group(s)", + default=None, + metavar="REGEX", + dest="filter_group", + ) + + job_filter_options.add_argument( + "--filter-rocprofv3", + type=str, + nargs="+", + help="Run rocprofv3 jobs with matching rocprofv3 config label(s)", + default=None, + metavar="REGEX", + ) + + job_filter_options.add_argument( + "--filter-benchmark", + type=str, + nargs="+", + choices=( + "baseline", + "disabled-sdk-contexts", + "sdk-buffer-overhead", + "sdk-callback-overhead", + "tool-runtime-overhead", + ), + help="Run rocprofv3 jobs with matching rocprofv3 config label(s)", + default=None, + metavar="REGEX", + ) + + shuffle_options = parser.add_argument_group("Randomization options") + + shuffle_options.add_argument( + "--shuffle", + action="store_true", + help="Randomly shuffle the ordering of the benchmark modes and rocprofv3 parameters", + ) + + shuffle_options.add_argument( + "--shuffle-rocprofv3", + action="store_true", + help="Randomly shuffle the ordering of the rocprofv3 parameters", + ) + + shuffle_options.add_argument( + "--shuffle-benchmark", + action="store_true", + help="Randomly shuffle the ordering of the benchmark modes", + ) + + sqlite3_options = parser.add_argument_group("SQLite3 options") + + sqlite3_options.add_argument( + "--sqlite3-database", + help="SQLite3 database filename", + type=str, + default="benchmark.db", + metavar="FILENAME", + ) + + mysql_options = parser.add_argument_group("MySQL options") + + mysql_options.add_argument( + "--mysql-user", + type=str, + help="Database user (MySQL only)", + default="root", + metavar="USERNAME", + ) + mysql_options.add_argument( + "--mysql-passwd", + type=str, + help="Database password (MySQL only)", + default=None, + metavar="PASSWORD", + ) + mysql_options.add_argument( + "--mysql-host", + type=str, + help="Database remote host (MySQL only)", + default="db.rocprofiler-benchmarking.svc.cluster.local", + metavar="URL", + ) + mysql_options.add_argument( + "--mysql-database", + type=str, + help="Database name (MySQL only)", + default="benchmark_db", + metavar="DATABASE_NAME", + ) + + args = parser.parse_args(argv) + + if args.db_backend == "sqlite3": + setattr(args, "db_placeholder", "?") + elif args.db_backend == "mysql": + setattr(args, "db_placeholder", "%s") + else: + raise ValueError(f"db_placeholder needs to be specified for {args.db_backend}") + + return args + + +def execute_input(connection, cursor, args): + """ + Executes YAML input + """ + + def apply_filter(filters, configs): + if not isinstance(filters, list): + filters = [filters] + if not isinstance(configs, list): + configs = [configs] + + matching = [] + for fitr in filters: + matching += [itr for itr in configs if re.match(fitr, itr)] + return matching + + def shuffle(data): + if isinstance(data, dict): + return dict( + [[itr, data[itr]] for itr in random.sample(list(data.keys()), len(data))] + ) + return random.sample(data, len(data)) + + with open(args.input, "r") as ifs: + spec = yaml.safe_load(ifs) + + for job in spec["jobs"]: + # populate these specs from the default if they were not explicitly provided + for param in [ + "command", + "benchmark", + "rocprofv3", + "name", + "group", + "environment", + "environment-ignore", + "launcher", + ]: + if param not in job and param in spec["defaults"]: + job[param] = spec["defaults"][param] + + # job name filtering + if args.filter_name is not None and job["name"]: + if not apply_filter(args.filter_name, job["name"]): + continue + + # job group filtering + if args.filter_group and job["group"]: + if not apply_filter(args.filter_group, job["group"]): + continue + + benchmarks = job["benchmark"] if "benchmark" in job else [] + if args.filter_benchmark and benchmarks: + benchmarks = apply_filter(args.filter_benchmark, benchmarks) + + if not benchmarks: + continue + + launcher = job["launcher"] if "launcher" in job else [] + environ = dict(os.environ) + + if "environment" in job: + for key, eitr in job["environment"].items(): + environ[key] = f"{eitr}" + + if "environment-ignore" in job: + environ_keys = list(environ.keys()) + matching = [] + for eitr in job["environment-ignore"]: + matching += [ + itr for itr in environ_keys if re.match(f"^({eitr})$", itr) + ] + + for mitr in matching: + del environ[mitr] + assert environ.get(mitr, None) is None + + # env substitution + for key, eitr in dict(os.environ).items(): + # env spec has this value + if key in environ: + if key in environ[key]: + environ[key] = ( + environ[key] + .replace(f"${key}", eitr) + .replace("${}{}{}".format("{", key, "}"), eitr) + ) + + launcher = [f"{itr}" for itr in launcher] + cmd = [f"{itr}" for itr in job["command"]] + + app_id = find_benchmarked_app(cursor, cmd, launcher, environ) + profile = None + + if app_id is None: + profile = execute_profile( + cmd, + args, + env=environ, + launcher=launcher, + ) + + app_id = insert_benchmarked_app( + cursor, cmd, launcher, environ, app_id, profile, args + ) + + for nitr in range(0, args.num_warmup_iterations): + execute_run( + cursor, + "baseline", + None, + cmd, + app_id, + nitr, + args, + env=environ, + launcher=launcher, + is_warmup=True, + ) + + for nitr in range(0, args.num_iterations): + + if args.shuffle or args.shuffle_benchmark: + _orig_benchmarks = benchmarks[:] + benchmarks = shuffle(_orig_benchmarks) + log_message( + f"Shuffled benchmarks: {_orig_benchmarks} => {benchmarks}" + ) + + for benchmark_mode in benchmarks: + + # baseline requires special handling since it doesn't use rocprofv3 + if benchmark_mode == "baseline": + + # generate the baselines + execute_run( + cursor, + benchmark_mode, + None, + cmd, + app_id, + nitr, + args, + env=environ, + launcher=launcher, + ) + + elif benchmark_mode in ( + "disabled-sdk-contexts", + "sdk-buffer-overhead", + "sdk-callback-overhead", + "tool-runtime-overhead", + ): + + # loop over the rocprofv3 configurations + rocprofv3_args_config = job["rocprofv3"] + if isinstance(rocprofv3_args_config, (list, tuple)): + rocprofv3_args_config = dict( + [["", itr] for itr in rocprofv3_args_config] + ) + + if args.shuffle or args.shuffle_rocprofv3: + _orig_rocprofv3_cfg = rocprofv3_args_config.keys() + rocprofv3_args_config = shuffle(rocprofv3_args_config) + log_message( + f"Shuffled rocprofv3: {list(_orig_rocprofv3_cfg)} => {list(rocprofv3_args_config.keys())}" + ) + + for label, rocprofv3_args in rocprofv3_args_config.items(): + + # job name filtering + if args.filter_rocprofv3 is not None and label: + if not apply_filter(args.filter_rocprofv3, [label]): + continue + + execute_run( + cursor, + benchmark_mode, + rocprofv3_args, + cmd, + app_id, + nitr, + args, + env=environ, + launcher=launcher, + ) + + else: + raise ValueError(f"Unsupported benchmark mode: {benchmark_mode}") + + # Commit the transaction + connection.commit() + + # Commit the transaction + connection.commit() + + +def main(): + """ + Main control flow function + """ + args = parse_args(sys.argv[1:]) + + if args.version: + for key, itr in CONST_VERSION_INFO.items(): + print(f" {key:>16}: {itr}") + return + + # Connect to the database + connection, cursor = connect_to_database(args) + + if args.input is not None: + execute_input(connection, cursor, args) + + log_message("Data has been inserted into the database successfully!") + + # Generate the analysis + generate_statistics(connection, cursor, args) + + log_message("Statistics data has been successfully updated!") + + generate_analysis(connection, cursor, args) + + log_message("Analysis has been completed successfully!") + + # Close the connection + cursor.close() + connection.close() + + log_message("Connection has been closed successfully!") + + +def generate_statistics(connection, cursor, args): + + def construct_where_condition(data, conditional="AND"): + conditions = [] + + for key, value in data.items(): + if value is None: + conditions.append(f"{key} IS NULL") + else: + if isinstance(value, str): + value_str = f"'{value}'" + else: + value_str = str(value) + conditions.append(f"{key} = {value_str}") + + return f" {conditional} ".join(conditions) + + select_stmt = "SELECT DISTINCT app_id, cfg_id, sdk_id FROM benchmark_metrics" + cursor.execute(select_stmt) + + selection_stmts = [] + for row in cursor.fetchall(): + params = dict(zip(["app_id", "cfg_id", "sdk_id"], row)) + where_cond = construct_where_condition(params) + + for metric in CONST_METRIC_LIST: + selections = ", ".join( + [ + f"{itr}({metric})" + for itr in ["COUNT", "SUM", "AVG", "MIN", "MAX", "STDDEV_SAMP"] + ] + ) + select_stmt = f"SELECT {selections} FROM benchmark_metrics WHERE {where_cond}" + selection_stmts.append([metric, params, select_stmt]) + + for metric, params, select_stmt in selection_stmts: + cursor.execute(select_stmt) + + rows = cursor.fetchall() + if len(rows) != 1: + fatal_error( + f"select statement should only have returned one row. found {len(rows)}. {metric} :: {params}" + ) + + stats = ["count", "sum", "mean", "min", "max", "std_dev"] + data = dict(zip(stats, rows[0])) + + where_cond = "metric_name LIKE '{}' AND {}".format( + metric, construct_where_condition(params) + ) + + existing_stmt = f"SELECT id FROM benchmark_statistics WHERE {where_cond}" + + cursor.execute(existing_stmt) + existing_id = cursor.fetchone() + if existing_id is not None: + + columns = stats[:] + insert_stmt = "UPDATE benchmark_statistics SET {} WHERE {}".format( + ", ".join([f"{itr} = {args.db_placeholder}" for itr in columns]), + where_cond, + ) + + values = [data[itr] for itr in stats] + cursor.execute(insert_stmt, values) + + else: + + columns = ["app_id", "cfg_id", "sdk_id", "metric_name", "metric_unit"] + stats + insert_stmt = "INSERT INTO benchmark_statistics ({}) VALUES ({})".format( + ", ".join(columns), ", ".join([args.db_placeholder for _ in columns]) + ) + + values = [ + params["app_id"], + params["cfg_id"], + params["sdk_id"], + metric, + CONST_METRIC_UNIT_MAP[metric], + ] + [data[itr] for itr in stats] + + cursor.execute(insert_stmt, values) + + # Commit the transaction + connection.commit() + + +def generate_analysis(connection, cursor, args): + pass + + +# Example usage +if __name__ == "__main__": + main() diff --git a/projects/rocprofiler-sdk/benchmark/source/lib/CMakeLists.txt b/projects/rocprofiler-sdk/benchmark/source/lib/CMakeLists.txt new file mode 100644 index 0000000000..f3c2eab054 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/lib/CMakeLists.txt @@ -0,0 +1,11 @@ +# +# +# + +# install the downloaded timem libraries +install( + DIRECTORY ${PROJECT_BINARY_DIR}/lib64/ + DESTINATION ${CMAKE_INSTALL_LIB64DIR} + FILES_MATCHING + PATTERN "*libtimem.*" + PATTERN "*/timemory/*") diff --git a/projects/rocprofiler-sdk/benchmark/source/share/CMakeLists.txt b/projects/rocprofiler-sdk/benchmark/source/share/CMakeLists.txt new file mode 100644 index 0000000000..9bfe2269d0 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/share/CMakeLists.txt @@ -0,0 +1,5 @@ +# +# +# + +add_subdirectory(rocprofiler-sdk) diff --git a/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/CMakeLists.txt new file mode 100644 index 0000000000..38e4a44367 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/CMakeLists.txt @@ -0,0 +1,12 @@ +# +# +# + +set(DATA_FILES benchmark_tables.sql benchmark_views.sql) + +foreach(_FILE ${DATA_FILES}) + configure_file(${_FILE} ${CMAKE_DATAROOT_OUTPUT_DIRECTORY}/rocprofiler-sdk/${_FILE} + COPYONLY) +endforeach() + +install(FILES ${DATA_FILES} DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-sdk) diff --git a/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_tables.sql b/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_tables.sql new file mode 100644 index 0000000000..bbd365d3d9 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_tables.sql @@ -0,0 +1,137 @@ +-- Application used for benchmarking +-- Columns such "hip_compiler_api", ..., "scratch_memory" are +-- the number of events in the given category, e.g. kernel_dispatch +-- represents the number of kernel dispatches in the app. These +-- can be approximate since for a given application, the exact +-- count may vary. +CREATE TABLE IF NOT EXISTS + `benchmarked_app` ( + id INT PRIMARY KEY AUTO_INCREMENT UNIQUE, + hash_id TEXT NOT NULL, + md5sum TEXT NOT NULL, + revision INT DEFAULT 0, + command JSON NOT NULL, + compiler_id TEXT, + compiler_version TEXT, + library_arch TEXT, + system_name TEXT, + system_processor TEXT, + system_version TEXT, + threads INT, + hip_compiler_api INT, + hip_runtime_api INT, + hsa_api INT, + kernel_dispatch INT, + marker_api INT, + memory_allocation INT, + memory_copy INT, + ompt INT, + rccl_api INT, + rocdecode_api INT, + rocjpeg_api INT, + scratch_memory INT, + environment JSON DEFAULT ("{}") + ); + +-- rocprofiler-sdk used for benchmarking +CREATE TABLE IF NOT EXISTS + `benchmarked_sdk` ( + id INT PRIMARY KEY AUTO_INCREMENT UNIQUE, + hash_id TEXT NOT NULL, + version_major INT NOT NULL, + version_minor INT NOT NULL, + version_patch INT NOT NULL, + soversion INT NOT NULL, + compiler_id TEXT NOT NULL, + compiler_version TEXT NOT NULL, + git_revision TEXT NOT NULL, + library_arch TEXT NOT NULL, + system_name TEXT NOT NULL, + system_processor TEXT NOT NULL, + system_version TEXT NOT NULL + ); + +-- rocprofiler-sdk used for benchmarking +CREATE TABLE IF NOT EXISTS + `benchmark_config` ( + id INT PRIMARY KEY AUTO_INCREMENT UNIQUE, + hash_id TEXT NOT NULL, + sdk_id INT, + label TEXT, -- name identifier + benchmark_mode TEXT CHECK ( + benchmark_mode IN ( + "baseline", + "disabled-sdk-contexts", + "sdk-buffer-overhead", + "sdk-callback-overhead", + "tool-runtime-overhead" + ) + ) NOT NULL, + kernel_rename INT, + group_by_queue INT, + kernel_trace INT, + hsa_trace INT, + hip_runtime_trace INT, + hip_compiler_trace INT, + marker_trace INT, + memory_copy_trace INT, + memory_allocation_trace INT, + scratch_memory_trace INT, + dispatch_counter_collection INT, + rccl_trace INT, + rocdecode_trace INT, + rocjpeg_trace INT, + pmc_counters JSON DEFAULT ("[]"), + pc_sampling_host_trap INT, + pc_sampling_stocastic INT, + advanced_thread_trace INT, + -- + -- Eventually, we will create tables for storing the subconfigurations for pc sampling and ATT + -- + -- pc_sampling_host_trap_config_id INT, + -- pc_sampling_stocastic_config_id INT, + -- advanced_thread_trace_config_id INT, + -- FOREIGN KEY (pc_sampling_host_trap_config_id) REFERENCES benchmark_pc_sampling_host_trap_config (id) ON UPDATE CASCADE, + -- FOREIGN KEY (pc_sampling_stocastic_config_id) REFERENCES benchmark_pc_sampling_stocastic_config (id) ON UPDATE CASCADE, + -- FOREIGN KEY (advanced_thread_trace_config_id) REFERENCES benchmark_advanced_thread_trace_config (id) ON UPDATE CASCADE + FOREIGN KEY (sdk_id) REFERENCES benchmarked_sdk (id) ON UPDATE CASCADE + ); + +-- metrics for the benchmark +CREATE TABLE IF NOT EXISTS + `benchmark_metrics` ( + id INT PRIMARY KEY AUTO_INCREMENT UNIQUE, + app_id INT NOT NULL, + cfg_id INT NOT NULL, + sdk_id INT, + executed_at TIMESTAMP NOT NULL, + wall_time DOUBLE NOT NULL, + cpu_time DOUBLE NOT NULL, + cpu_util DOUBLE NOT NULL, + peak_rss DOUBLE NOT NULL, + page_rss DOUBLE NOT NULL, + virtual_memory DOUBLE NOT NULL, + major_page_faults BIGINT NOT NULL, + minor_page_faults BIGINT NOT NULL, + priority_context_switches BIGINT NOT NULL, + voluntary_context_switches BIGINT NOT NULL, + FOREIGN KEY (app_id) REFERENCES benchmarked_app (id) ON UPDATE CASCADE, + FOREIGN KEY (cfg_id) REFERENCES benchmark_config (id) ON UPDATE CASCADE, + FOREIGN KEY (sdk_id) REFERENCES benchmarked_sdk (id) ON UPDATE CASCADE + ); + +CREATE TABLE IF NOT EXISTS + `benchmark_statistics` ( + id INT PRIMARY KEY AUTO_INCREMENT UNIQUE, + app_id INT NOT NULL, + cfg_id INT NOT NULL, + sdk_id INT, + metric_name TEXT NOT NULL, + metric_unit TEXT NOT NULL, + count INT NOT NULL, + sum DOUBLE NOT NULL, + mean DOUBLE NOT NULL, + min DOUBLE NOT NULL, + max DOUBLE NOT NULL, + std_dev DOUBLE + ); diff --git a/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_views.sql b/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_views.sql new file mode 100644 index 0000000000..54d0193911 --- /dev/null +++ b/projects/rocprofiler-sdk/benchmark/source/share/rocprofiler-sdk/benchmark_views.sql @@ -0,0 +1,75 @@ +-- Analysis views used for benchmarking +CREATE VIEW IF NOT EXISTS + `benchmark_analysis_{{metric}}` AS +WITH + baseline AS ( + SELECT + * + FROM + benchmark_statistics BL + WHERE + BL.sdk_id IS NULL + AND BL.metric_name = "{{metric}}" + ) +SELECT + ST.id, + ST.app_id, + ST.cfg_id, + ST.sdk_id, + BS.git_revision, + BA.command, + ST.metric_name, + ST.metric_unit, + ST.count, + ST.mean, + ST.std_dev AS `+/-`, + BL.mean AS baseline_mean, + BL.std_dev AS `+/- (baseline)`, + ((ST.mean - BL.mean) / BL.mean) * 100 AS `overhead (%)`, + BC.benchmark_mode, + BC.label AS benchmark_label +FROM + benchmark_statistics ST + JOIN benchmark_config BC ON BC.id = ST.cfg_id + JOIN benchmarked_sdk BS ON BS.id = ST.sdk_id + JOIN benchmarked_app BA ON BA.id = ST.app_id + JOIN baseline BL ON ( + BL.app_id = ST.app_id + AND BL.metric_name = ST.metric_name + ) +WHERE + ST.metric_name = "{{metric}}" + AND ST.sdk_id IS NOT NULL +ORDER BY + `overhead (%)` DESC; + +-- benchmarked_app without environment info +CREATE VIEW IF NOT EXISTS + `benchmarked_app_without_env` AS +SELECT + id, + hash_id, + md5sum, + revision, + command, + compiler_id, + compiler_version, + library_arch, + system_name, + system_processor, + system_version, + threads, + hip_compiler_api, + hip_runtime_api, + hsa_api, + kernel_dispatch, + marker_api, + memory_allocation, + memory_copy, + ompt, + rccl_api, + rocdecode_api, + rocjpeg_api, + scratch_memory +FROM + benchmarked_app; diff --git a/projects/rocprofiler-sdk/cmake/rocprofiler_config_packaging.cmake b/projects/rocprofiler-sdk/cmake/rocprofiler_config_packaging.cmake index 01a62b2475..fced05e79c 100644 --- a/projects/rocprofiler-sdk/cmake/rocprofiler_config_packaging.cmake +++ b/projects/rocprofiler-sdk/cmake/rocprofiler_config_packaging.cmake @@ -60,8 +60,8 @@ list(LENGTH ROCPROFILER_PACKAGING_COMPONENTS NUM_ROCPROFILER_PACKAGING_COMPONENT # the packages we will generate set(ROCPROFILER_COMPONENT_GROUPS "core" "docs" "tests" "roctx") -set(COMPONENT_GROUP_core_COMPONENTS "core" "development" "samples" "tools" "Development" - "Unspecified") +set(COMPONENT_GROUP_core_COMPONENTS "core" "development" "samples" "tools" "benchmark" + "Development" "Unspecified") set(COMPONENT_GROUP_docs_COMPONENTS "docs") set(COMPONENT_GROUP_tests_COMPONENTS "tests") set(COMPONENT_GROUP_roctx_COMPONENTS "roctx") @@ -87,7 +87,10 @@ set(COMPONENT_DESC_roctx "ROCm Tools Extension library and headers") set(EXPECTED_PACKAGING_COMPONENTS 6) if(ROCPROFILER_BUILD_DOCS) - set(EXPECTED_PACKAGING_COMPONENTS 7) + math(EXPR EXPECTED_PACKAGING_COMPONENTS "${EXPECTED_PACKAGING_COMPONENTS} + 1") +endif() +if(ROCPROFILER_BUILD_BENCHMARK) + math(EXPR EXPECTED_PACKAGING_COMPONENTS "${EXPECTED_PACKAGING_COMPONENTS} + 1") endif() if(NOT NUM_ROCPROFILER_PACKAGING_COMPONENTS EQUAL EXPECTED_PACKAGING_COMPONENTS) diff --git a/projects/rocprofiler-sdk/cmake/rocprofiler_formatting.cmake b/projects/rocprofiler-sdk/cmake/rocprofiler_formatting.cmake index 0906e04d50..53166811c2 100644 --- a/projects/rocprofiler-sdk/cmake/rocprofiler_formatting.cmake +++ b/projects/rocprofiler-sdk/cmake/rocprofiler_formatting.cmake @@ -74,7 +74,7 @@ if(ROCPROFILER_CLANG_FORMAT_EXE set(rocp_cmake_files ${PROJECT_SOURCE_DIR}/CMakeLists.txt ${PROJECT_SOURCE_DIR}/external/CMakeLists.txt) - foreach(_DIR cmake samples source tests) + foreach(_DIR cmake samples source tests benchmark) foreach(_TYPE header_files source_files cmake_files python_files) set(${_TYPE}) endforeach() diff --git a/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake b/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake index 2da7445616..9b48a0472b 100644 --- a/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake +++ b/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake @@ -38,6 +38,7 @@ rocprofiler_add_option(ROCPROFILER_BUILD_TESTS "Enable building the tests" ${ROCPROFILER_BUILD_CI}) rocprofiler_add_option(ROCPROFILER_BUILD_SAMPLES "Enable building the code samples" ${ROCPROFILER_BUILD_CI}) +rocprofiler_add_option(ROCPROFILER_BUILD_BENCHMARK "Enable building the benchmarks" OFF) rocprofiler_add_option( ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS "Disable adjusting for clock skew b/t CPU and GPU timestamps" OFF ADVANCED) diff --git a/projects/rocprofiler-sdk/docker/Dockerfile b/projects/rocprofiler-sdk/docker/Dockerfile new file mode 100644 index 0000000000..2ac578e224 --- /dev/null +++ b/projects/rocprofiler-sdk/docker/Dockerfile @@ -0,0 +1,15 @@ +ARG BASE_IMAGE=rocm/rocm-terminal +FROM $BASE_IMAGE + +ENV DEBIAN_FRONTEND=noninteractive + +ARG BRANCH=amd-staging + +RUN git clone -b ${BRANCH} https://github.com/ROCm/rocprofiler-sdk.git rocprofiler-sdk-source && \ + python3 -m pip install -r rocprofiler-sdk-source/requirements.txt && \ + sudo apt update && \ + sudo apt install -y libdw-dev && \ + cmake -B rocprofiler-sdk-build -DCMAKE_BUILD_TYPE=RelWithDebInfo -DROCPROFILER_BUILD_{SAMPLES,TESTS,BENCHMARK}=ON -DPython3_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$(realpath /opt/rocm) rocprofiler-sdk-source && \ + cmake --build rocprofiler-sdk-build --target all --parallel 16 && \ + sudo cmake --build rocprofiler-sdk-build --target install && \ + sudo rm -rf rocprofiler-sdk-source rocprofiler-sdk-build diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index 1e1930e4c6..4d1769271e 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -248,6 +248,11 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins choices=("csv", "json", "pftrace", "otf2"), type=str.lower, ) + add_parser_bool_argument( + io_options, + "--output-config", + help="Generate a output file of the rocprofv3 configuration, e.g. out_config.json", + ) io_options.add_argument( "--log-level", help="Set the desired log level", @@ -518,6 +523,11 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins type=str, choices=("hour", "min", "sec", "msec", "usec", "nsec"), ) + add_parser_bool_argument( + filter_options, + "--selected-regions", + help="If set, rocprofv3 will only profile regions of code surrounded by roctxProfilerResume(0) and roctxProfilerPause(0)", + ) perfetto_options = parser.add_argument_group("Perfetto-specific options") @@ -592,6 +602,19 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins "--realpath", help=argparse.SUPPRESS, ) + advanced_options.add_argument( + "--benchmark-mode", + choices=( + "disabled-sdk-contexts", + "sdk-buffer-overhead", + "sdk-callback-overhead", + "tool-runtime-overhead", + "execution-profile", + ), + help=argparse.SUPPRESS, + default=None, + type=str.lower, + ) advanced_options.add_argument( "-A", "--agent-index", @@ -1034,6 +1057,7 @@ def run(app_args, args, **kwargs): update_env("ROCPROF_OUTPUT_FILE_NAME", _output_file) update_env("ROCPROF_OUTPUT_PATH", _output_path) + update_env("ROCPROF_OUTPUT_CONFIG_FILE", args.output_config, overwrite_if_true=True) if app_pass is not None and args.sub_directory is not None: app_env["ROCPROF_OUTPUT_PATH"] = os.path.join( f"{_output_path}", f"{args.sub_directory}{app_pass}" @@ -1198,6 +1222,11 @@ def run(app_args, args, **kwargs): args.truncate_kernels, overwrite_if_true=True, ) + update_env( + "ROCPROF_SELECTED_REGIONS", + args.selected_regions, + overwrite_if_true=True, + ) if args.list_avail: update_env( @@ -1241,6 +1270,24 @@ def run(app_args, args, **kwargs): args.log_level, ) + if args.benchmark_mode: + if args.benchmark_mode == "execution-profile": + if args.group_by_queue is None: + update_env( + "ROCPROF_GROUP_BY_QUEUE", + False, + overwrite=True, + ) + elif args.group_by_queue: + fatal_error( + "rocprofv3 requires --group-by-queue=false for --benchmark-mode=execution-profile" + ) + + update_env( + "ROCPROF_BENCHMARK_MODE", + args.benchmark_mode, + ) + for opt, env_val in dict( [ ["kernel_rename", "KERNEL_RENAME"], diff --git a/projects/rocprofiler-sdk/source/docs/how-to/samples.rst b/projects/rocprofiler-sdk/source/docs/how-to/samples.rst index b8c5ea2765..c7f28f30b4 100644 --- a/projects/rocprofiler-sdk/source/docs/how-to/samples.rst +++ b/projects/rocprofiler-sdk/source/docs/how-to/samples.rst @@ -46,4 +46,4 @@ To run the built samples, ``cd`` into the ``build-rocprofiler-sdk-samples`` dire ctest -V -The `-V` option enables verbose output, providing detailed information about the test execution. \ No newline at end of file +The `-V` option enables verbose output, providing detailed information about the test execution. diff --git a/projects/rocprofiler-sdk/source/docs/install/installation.rst b/projects/rocprofiler-sdk/source/docs/install/installation.rst index bf5c76911a..7ecfab20f1 100644 --- a/projects/rocprofiler-sdk/source/docs/install/installation.rst +++ b/projects/rocprofiler-sdk/source/docs/install/installation.rst @@ -126,4 +126,4 @@ If you have ROCm version 6.2 or higher installed, you can use the package manage .. code-block:: shell - $ sudo zypper install rocprofiler-sdk \ No newline at end of file + $ sudo zypper install rocprofiler-sdk diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/operators.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/operators.hpp index 4c631180fa..9580cfa5c1 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/operators.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/operators.hpp @@ -55,7 +55,7 @@ } #define ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(TYPE) \ - inline bool operator>(TYPE lhs, TYPE rhs) { return (lhs == rhs || !(lhs < rhs)); } \ + inline bool operator>(TYPE lhs, TYPE rhs) { return !(lhs == rhs || lhs < rhs); } \ inline bool operator<=(TYPE lhs, TYPE rhs) { return (lhs == rhs || lhs < rhs); } \ inline bool operator>=(TYPE lhs, TYPE rhs) { return !(lhs < rhs); } diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h index 9a6ae17af4..3711b83970 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h @@ -135,6 +135,29 @@ rocprofiler_configure_external_correlation_id_request_service( rocprofiler_external_correlation_id_request_cb_t callback, void* callback_args) ROCPROFILER_API ROCPROFILER_NONNULL(4); +/** + * @brief Query the name of the external correlation request kind. The name retrieved from this + * function is a string literal that is encoded in the read-only section of the binary (i.e. it is + * always "allocated" and never "deallocated"). + * + * @param [in] kind External correlation id request domain + * @param [out] name If non-null and the name is a constant string that does not require dynamic + * allocation, this paramter will be set to the address of the string literal, otherwise it will + * be set to nullptr + * @param [out] name_len If non-null, this will be assigned the length of the name (regardless of + * the name is a constant string or requires dynamic allocation) + * @return ::rocprofiler_status_t + * @retval ::ROCPROFILER_STATUS_ERROR_KIND_NOT_FOUND Returned if the domain id is not valid + * @retval ::ROCPROFILER_STATUS_SUCCESS Returned if a valid domain, regardless if there is a + * constant string or not. + */ +ROCPROFILER_SDK_EXPERIMENTAL +rocprofiler_status_t +rocprofiler_query_external_correlation_id_request_kind_name( + rocprofiler_external_correlation_id_request_kind_t kind, + const char** name, + uint64_t* name_len) ROCPROFILER_API; + /** * @brief Push default value for `external` field in ::rocprofiler_correlation_id_t onto stack. * diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/version.h.in b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/version.h.in index bad03569f4..aa09d574b4 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/version.h.in +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/version.h.in @@ -94,7 +94,7 @@ #define ROCPROFILER_VERSION_MAJOR @PROJECT_VERSION_MAJOR@ #define ROCPROFILER_VERSION_MINOR @PROJECT_VERSION_MINOR@ #define ROCPROFILER_VERSION_PATCH @PROJECT_VERSION_PATCH@ -#define ROCPROFILER_SOVERSION (10000 * @PROJECT_VERSION_MAJOR@) +#define ROCPROFILER_SOVERSION @PROJECT_VERSION_MAJOR@ #define ROCPROFILER_VERSION_STRING "@FULL_VERSION_STRING@" #define ROCPROFILER_GIT_DESCRIBE "@ROCPROFILER_SDK_GIT_DESCRIBE@" #define ROCPROFILER_GIT_REVISION "@ROCPROFILER_SDK_GIT_REVISION@" diff --git a/projects/rocprofiler-sdk/source/lib/output/output_config.hpp b/projects/rocprofiler-sdk/source/lib/output/output_config.hpp index e27284927e..4ecdbffa52 100644 --- a/projects/rocprofiler-sdk/source/lib/output/output_config.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/output_config.hpp @@ -130,6 +130,7 @@ output_config::save(ArchiveT& ar) const CFG_SERIALIZE_MEMBER(otf2_output); CFG_SERIALIZE_MEMBER(summary_output); CFG_SERIALIZE_MEMBER(kernel_rename); + CFG_SERIALIZE_MEMBER(group_by_queue); #undef CFG_SERIALIZE_MEMBER #undef CFG_SERIALIZE_NAMED_MEMBER diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt index 07371d72e0..9cc1292575 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt @@ -4,7 +4,7 @@ rocprofiler_activate_clang_tidy() -set(TOOL_HEADERS config.hpp helper.hpp stream_stack.hpp) +set(TOOL_HEADERS config.hpp execution_profile.hpp helper.hpp stream_stack.hpp) set(TOOL_SOURCES config.cpp main.c tool.cpp stream_stack.cpp) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index fdebf3d8bf..e42d5d6ccc 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -319,6 +319,23 @@ config::config() std::stoull(_config_params.at(2))}); } } + + // Benchmarking Enable/Disable + if(!benchmark_mode_env.empty()) + { + const auto valid_options = std::unordered_map{ + {"disabled-sdk-contexts", benchmark::disabled_contexts_overhead}, + {"sdk-buffer-overhead", benchmark::sdk_buffered_overhead}, + {"sdk-callback-overhead", benchmark::sdk_callback_overhead}, + {"tool-runtime-overhead", benchmark::tool_runtime_overhead}, + {"execution-profile", benchmark::execution_profile}, + }; + + ROCP_FATAL_IF(valid_options.count(benchmark_mode_env) == 0) + << fmt::format("Invalid value for ROCPROF_BENCHMARK_MODE: {}", benchmark_mode_env); + + benchmark_mode = valid_options.at(benchmark_mode_env); + } } std::string diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp index b66b750e23..57e14475ce 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp @@ -87,6 +87,16 @@ struct config : output_config void save(ArchiveT& ar) const; }; + enum class benchmark + { + none = 0, + disabled_contexts_overhead, + sdk_callback_overhead, + sdk_buffered_overhead, + tool_runtime_overhead, + execution_profile, + }; + config(); ~config() = default; @@ -114,11 +124,13 @@ struct config : output_config bool rocjpeg_api_trace = get_env("ROCPROF_ROCJPEG_API_TRACE", false); bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); - bool pc_sampling_host_trap = false; bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false); - bool pc_sampling_stochastic = false; bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false); bool enable_signal_handlers = get_env("ROCPROF_SIGNAL_HANDLERS", true); + bool selected_regions = get_env("ROCPROF_SELECTED_REGIONS", false); + bool output_config_file = get_env("ROCPROF_OUTPUT_CONFIG_FILE", false); + bool pc_sampling_host_trap = false; + bool pc_sampling_stochastic = false; size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1); rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE; rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE; @@ -148,6 +160,9 @@ struct config : output_config uint64_t counter_groups_interval = get_env("ROCPROF_COUNTER_GROUPS_INTERVAL", 1); uint64_t minimum_output_bytes = get_env("ROCPROF_MINIMUM_OUTPUT_BYTES", 0); + std::string benchmark_mode_env = get_env("ROCPROF_BENCHMARK_MODE", ""); + benchmark benchmark_mode = benchmark::none; + template void save(ArchiveT&) const; @@ -180,6 +195,8 @@ template void config::save(ArchiveT& ar) const { + CFG_SERIALIZE_NAMED_MEMBER("benchmark_mode", benchmark_mode_env); + CFG_SERIALIZE_MEMBER(kernel_trace); CFG_SERIALIZE_MEMBER(hsa_core_api_trace); CFG_SERIALIZE_MEMBER(hsa_amd_ext_api_trace); @@ -194,6 +211,7 @@ config::save(ArchiveT& ar) const CFG_SERIALIZE_MEMBER(hip_compiler_api_trace); CFG_SERIALIZE_MEMBER(rccl_api_trace); CFG_SERIALIZE_MEMBER(rocdecode_api_trace); + CFG_SERIALIZE_MEMBER(rocjpeg_api_trace); CFG_SERIALIZE_MEMBER(mpi_rank); CFG_SERIALIZE_MEMBER(mpi_size); @@ -207,7 +225,13 @@ config::save(ArchiveT& ar) const CFG_SERIALIZE_MEMBER(truncate); CFG_SERIALIZE_MEMBER(minimum_output_bytes); CFG_SERIALIZE_MEMBER(enable_signal_handlers); + CFG_SERIALIZE_MEMBER(selected_regions); + CFG_SERIALIZE_MEMBER(counter_groups_random_seed); + CFG_SERIALIZE_MEMBER(counter_groups_interval); + + CFG_SERIALIZE_MEMBER(pc_sampling_host_trap); + CFG_SERIALIZE_MEMBER(pc_sampling_stochastic); CFG_SERIALIZE_MEMBER(pc_sampling_method); CFG_SERIALIZE_MEMBER(pc_sampling_unit); CFG_SERIALIZE_MEMBER(pc_sampling_interval); @@ -222,8 +246,10 @@ config::save(ArchiveT& ar) const CFG_SERIALIZE_MEMBER(att_param_target_cu); CFG_SERIALIZE_MEMBER(att_capability); CFG_SERIALIZE_MEMBER(att_param_perfcounters); + CFG_SERIALIZE_MEMBER(att_param_perf_ctrl); - static_cast(*this).save(ar); + // serialize the base class + static_cast(this)->save(ar); } #undef CFG_SERIALIZE_MEMBER diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/execution_profile.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/execution_profile.hpp new file mode 100644 index 0000000000..d1b014b5d0 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/execution_profile.hpp @@ -0,0 +1,104 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/output/metadata.hpp" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace tool +{ +struct execution_profile_data +{ + using extern_corr_id_request_t = rocprofiler_external_correlation_id_request_kind_t; + using operation_set_t = std::unordered_set; + + std::unordered_map category_count = {}; + std::unordered_map category_op_count = {}; + std::unordered_set threads = {}; + std::unordered_set contexts = {}; +}; + +struct execution_profile_category_data +{ + uint64_t count = 0; // total invocations of a given category + uint64_t unique = 0; // number of unique operations +}; +} // namespace tool +} // namespace rocprofiler + +namespace cereal +{ +template +void +save(ArchiveT& ar, ::rocprofiler::tool::execution_profile_category_data data) +{ + ar(cereal::make_nvp("count", data.count)); + ar(cereal::make_nvp("unique", data.unique)); +} + +template +void +save(ArchiveT& ar, const ::rocprofiler::tool::execution_profile_data& data) +{ + namespace tool = ::rocprofiler::tool; + + using category_count_map_t = std::map; + + auto _category_count = category_count_map_t{}; + for(auto itr : data.category_count) + { + const char* _name = nullptr; + ROCPROFILER_CHECK(rocprofiler_query_external_correlation_id_request_kind_name( + itr.first, &_name, nullptr)); + if(_name) + { + auto _unique_ops = data.category_op_count.at(itr.first).size(); + auto _kind_name = std::string{_name}; + std::for_each( + _kind_name.begin(), _kind_name.end(), [](auto& v) { v = ::std::tolower(v); }); + + _category_count.emplace(_kind_name, + tool::execution_profile_category_data{itr.second, _unique_ops}); + } + } + + ar(cereal::make_nvp("threads", data.threads.size())); + ar(cereal::make_nvp("contexts", data.contexts.size())); + for(auto itr : _category_count) + ar(cereal::make_nvp(itr.first.c_str(), itr.second)); +} +} // namespace cereal diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index 9f9b09368b..1cf9fe83d5 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -20,10 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "rocprofiler-sdk/defines.h" +#include "rocprofiler-sdk/dispatch_counting_service.h" #define _GNU_SOURCE 1 #define _DEFAULT_SOURCE 1 #include "config.hpp" +#include "execution_profile.hpp" #include "helper.hpp" #include "stream_stack.hpp" @@ -50,6 +53,7 @@ #include "lib/output/output_stream.hpp" #include "lib/output/statistics.hpp" #include "lib/output/stream_info.hpp" +#include "lib/output/timestamps.hpp" #include "lib/output/tmp_file.hpp" #include "lib/output/tmp_file_buffer.hpp" @@ -63,11 +67,13 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -146,14 +152,6 @@ add_destructor(Tp*& ptr) return ptr; } -#define ADD_DESTRUCTOR(PTR) \ - { \ - static auto _once = std::once_flag{}; \ - std::call_once(_once, []() { add_destructor(PTR); }); \ - } - -#undef ADD_DESTRUCTOR - struct chained_siginfo { int signo = 0; @@ -207,6 +205,11 @@ struct buffer_ids rocjpeg_api_trace, pc_sampling_stochastic}; } + auto pc_sampling_buffers_as_array() const + { + return std::array{pc_sampling_host_trap, + pc_sampling_stochastic}; + } }; buffer_ids& @@ -246,10 +249,11 @@ using kernel_iteration_t = std::unordered_map; using kernel_rename_stack_t = std::stack; -auto* tool_metadata = as_pointer(tool::metadata::inprocess{}); -auto target_kernels = common::Synchronized{}; -std::mutex att_shader_data; +auto* tool_metadata = as_pointer(tool::metadata::inprocess{}); +auto target_kernels = common::Synchronized{}; +auto* execution_profile = as_pointer>(); auto counter_collection_ctx = rocprofiler_context_id_t{0}; +std::mutex att_shader_data; thread_local auto thread_dispatch_rename = as_pointer(); thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() { @@ -336,10 +340,12 @@ get_client_ctx() void flush() { + constexpr auto null_buffer_id = rocprofiler_buffer_id_t{.handle = 0}; + ROCP_INFO << "flushing buffers..."; for(auto itr : get_buffers().as_array()) { - if(itr.handle > 0) + if(itr > null_buffer_id) { ROCP_INFO << "flushing buffer " << itr.handle; ROCPROFILER_CALL(rocprofiler_flush_buffer(itr), "buffer flush"); @@ -419,6 +425,32 @@ collection_period_cntrl(std::promise&& _promise, rocprofiler_context_id_t } } +int +record_execution_profile(rocprofiler_thread_id_t thr_id, + rocprofiler_context_id_t ctx_id, + rocprofiler_external_correlation_id_request_kind_t kind, + rocprofiler_tracing_operation_t op, + uint64_t /*internal_corr_id*/, + rocprofiler_user_data_t* /*external_corr_id*/, + void* /*user_data*/) +{ + auto _record_data = [](tool::execution_profile_data& _data, + rocprofiler_thread_id_t _thr_id, + rocprofiler_context_id_t _ctx_id, + rocprofiler_external_correlation_id_request_kind_t _kind, + rocprofiler_tracing_operation_t _op) { + _data.category_count[_kind] += 1; + _data.category_op_count[_kind].emplace(_op); + _data.threads.emplace(_thr_id); + _data.contexts.emplace(_ctx_id); + }; + + if(execution_profile) + execution_profile->wlock(std::move(_record_data), thr_id, ctx_id, kind, op); + + return 0; +} + int set_kernel_rename_and_stream_display_correlation_id( rocprofiler_thread_id_t thr_id, @@ -623,6 +655,27 @@ runtime_initialization_callback(rocprofiler_callback_tracing_record_t record, common::consume_args(user_data, data); } +void +dummy_callback_tracing_callback(rocprofiler_callback_tracing_record_t /*record*/, + rocprofiler_user_data_t* /*user_data*/, + void* /*data*/) +{} + +void +dummy_counter_dispatch_callback(rocprofiler_dispatch_counting_service_data_t, + rocprofiler_profile_config_id_t*, + rocprofiler_user_data_t*, + void*) +{} + +void +dummy_counter_record_callback(rocprofiler_dispatch_counting_service_data_t, + rocprofiler_record_counter_t*, + size_t, + rocprofiler_user_data_t, + void*) +{} + void callback_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, @@ -865,6 +918,10 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, auto* sym_data = static_cast(record.payload); if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) { + ROCP_TRACE << fmt::format("adding kernel symbol info for kernel_id={} :: {}", + sym_data->kernel_id, + sym_data->kernel_name); + auto success = CHECK_NOTNULL(tool_metadata) ->add_kernel_symbol(kernel_symbol_info{ get_dereference(sym_data), @@ -917,6 +974,15 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, (void) data; } +void +dummy_buffered_tracing_callback(rocprofiler_context_id_t /*context*/, + rocprofiler_buffer_id_t /*buffer_id*/, + rocprofiler_record_header_t** /*headers*/, + size_t /*num_headers*/, + void* /*user_data*/, + uint64_t /*drop_count*/) +{} + void buffered_tracing_callback(rocprofiler_context_id_t /*context*/, rocprofiler_buffer_id_t /*buffer_id*/, @@ -1234,12 +1300,12 @@ get_att_perfcounter_params(rocprofiler_agent_id_t agen } void -rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, - rocprofiler_buffer_id_t /* buffer_id*/, - rocprofiler_record_header_t** headers, - size_t num_headers, - void* /*data*/, - uint64_t /* drop_count*/) +pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, + rocprofiler_buffer_id_t /* buffer_id*/, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* /*data*/, + uint64_t /* drop_count*/) { if(!headers) return; @@ -1345,10 +1411,10 @@ att_dispatch_callback(rocprofiler_agent_id_t /* agent_id */, } void -dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data, - rocprofiler_counter_config_id_t* config, - rocprofiler_user_data_t* user_data, - void* /*callback_data_args*/) +counter_dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* /*callback_data_args*/) { static auto kernel_iteration = common::Synchronized{}; @@ -1499,9 +1565,10 @@ if_pc_sample_config_match(rocprofiler_agent_id_t agent_id, } void -configure_pc_sampling_on_all_agents(uint64_t buffer_size, - uint64_t buffer_watermark, - void* tool_data) +configure_pc_sampling_on_all_agents(uint64_t buffer_size, + uint64_t buffer_watermark, + void* tool_data, + rocprofiler_buffer_tracing_cb_t pc_sampling_cb) { auto method = tool::get_config().pc_sampling_method_value; auto unit = tool::get_config().pc_sampling_unit_value; @@ -1515,7 +1582,7 @@ configure_pc_sampling_on_all_agents(uint64_t buffer_size, buffer_size, buffer_watermark, ROCPROFILER_BUFFER_POLICY_LOSSLESS, - rocprofiler_pc_sampling_callback, + pc_sampling_cb, tool_data, buffer_id), "buffer creation"); @@ -1544,9 +1611,81 @@ configure_pc_sampling_on_all_agents(uint64_t buffer_size, ROCP_FATAL << "Given PC sampling configuration is not supported on any of the agents"; } +struct real_callbacks_t +{}; + +struct dummy_callbacks_t +{}; + +constexpr auto use_real_callbacks = real_callbacks_t{}; +constexpr auto use_dummy_callbacks = dummy_callbacks_t{}; + +struct tracing_callbacks_t +{ + tracing_callbacks_t() = delete; + + tracing_callbacks_t(real_callbacks_t) + : code_object_tracing{code_object_tracing_callback} + , cntrl_tracing{cntrl_tracing_callback} + , kernel_rename{kernel_rename_callback} + , hip_stream{hip_stream_display_callback} + , callback_tracing{callback_tracing_callback} + , buffered_tracing{buffered_tracing_callback} + , pc_sampling{pc_sampling_callback} + , att_dispatch{att_dispatch_callback} + , att_shader_data{att_shader_data_callback} + , counter_dispatch{counter_dispatch_callback} + , counter_record{counter_record_callback} + {} + + explicit tracing_callbacks_t(dummy_callbacks_t) + : code_object_tracing{dummy_callback_tracing_callback} + , cntrl_tracing{dummy_callback_tracing_callback} + , kernel_rename{dummy_callback_tracing_callback} + , hip_stream{dummy_callback_tracing_callback} + , callback_tracing{dummy_callback_tracing_callback} + , buffered_tracing{dummy_buffered_tracing_callback} + , pc_sampling{dummy_buffered_tracing_callback} + , counter_dispatch{dummy_counter_dispatch_callback} + , counter_record{dummy_counter_record_callback} + {} + + const rocprofiler_callback_tracing_cb_t code_object_tracing = nullptr; + const rocprofiler_callback_tracing_cb_t cntrl_tracing = nullptr; + const rocprofiler_callback_tracing_cb_t kernel_rename = nullptr; + const rocprofiler_callback_tracing_cb_t hip_stream = nullptr; + const rocprofiler_callback_tracing_cb_t callback_tracing = nullptr; + const rocprofiler_buffer_tracing_cb_t buffered_tracing = nullptr; + const rocprofiler_buffer_tracing_cb_t pc_sampling = nullptr; + const rocprofiler_att_dispatch_callback_t att_dispatch = nullptr; + const rocprofiler_att_shader_data_callback_t att_shader_data = nullptr; + const rocprofiler_dispatch_counting_service_cb_t counter_dispatch = nullptr; + const rocprofiler_dispatch_counting_record_cb_t counter_record = nullptr; +}; + +auto +get_tracing_callbacks() +{ + // for the benchmarking modes of sdk buffer/callback overhead, we are measuring the cost + // of the SDK invoking the callbacks to the tool. We do not want to include the overhead + // of the tool doing any work so we use "dummy" callbacks (i.e. functions which just + // immediately return) + if(tool::get_config().benchmark_mode == tool::config::benchmark::sdk_buffered_overhead || + tool::get_config().benchmark_mode == tool::config::benchmark::sdk_callback_overhead || + tool::get_config().benchmark_mode == tool::config::benchmark::execution_profile) + { + return tracing_callbacks_t{use_dummy_callbacks}; + } + + return tracing_callbacks_t{use_real_callbacks}; +} + int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { + static constexpr auto null_context_id = rocprofiler_context_id_t{.handle = 0}; + static constexpr auto null_buffer_id = rocprofiler_buffer_id_t{.handle = 0}; + client_finalizer = fini_func; const uint64_t buffer_size = 16 * common::units::get_page_size(); @@ -1556,18 +1695,39 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "create context failed"); - auto code_obj_ctx = rocprofiler_context_id_t{0}; + auto code_obj_ctx = null_context_id; ROCPROFILER_CALL(rocprofiler_create_context(&code_obj_ctx), "failed to create context"); + auto start_context = [](rocprofiler_context_id_t ctx_id, std::string_view msg) { + using benchmark = tool::config::benchmark; + // do not start context if we are benchmarking the overhead of a service + // being available but unused by any contexts + if(tool::get_config().benchmark_mode != benchmark::disabled_contexts_overhead && + ctx_id != null_context_id) + { + if(tool::get_config().benchmark_mode == benchmark::execution_profile) + { + ROCPROFILER_CHECK(rocprofiler_configure_external_correlation_id_request_service( + ctx_id, nullptr, 0, record_execution_profile, nullptr)); + } + + ROCP_INFO << fmt::format("starting {} context...", msg); + ROCPROFILER_CHECK(rocprofiler_start_context(ctx_id)); + } + }; + + auto callbacks = get_tracing_callbacks(); + ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service(code_obj_ctx, ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, nullptr, 0, - code_object_tracing_callback, + callbacks.code_object_tracing, nullptr), "code object tracing configure failed"); - ROCPROFILER_CALL(rocprofiler_start_context(code_obj_ctx), "start context failed"); + + start_context(code_obj_ctx, "code object"); if(tool::get_config().marker_api_trace) { @@ -1576,11 +1736,11 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, nullptr, 0, - callback_tracing_callback, + callbacks.callback_tracing, nullptr), "callback tracing service failed to configure"); - auto pause_resume_ctx = rocprofiler_context_id_t{0}; + auto pause_resume_ctx = null_context_id; ROCPROFILER_CALL(rocprofiler_create_context(&pause_resume_ctx), "failed to create context"); ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( @@ -1588,122 +1748,154 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, nullptr, 0, - cntrl_tracing_callback, + callbacks.cntrl_tracing, static_cast(&get_client_ctx())), "callback tracing service failed to configure"); - ROCPROFILER_CALL(rocprofiler_start_context(pause_resume_ctx), "start context failed"); + start_context(pause_resume_ctx, "marker pause/resume"); } - if(tool::get_config().kernel_trace) + struct buffer_service_config { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - tool_data, - &get_buffers().kernel_trace), - "buffer creation"); + bool option = false; + rocprofiler_buffer_tracing_kind_t kind = ROCPROFILER_BUFFER_TRACING_NONE; + rocprofiler_buffer_id_t& buffer_id; + }; - ROCPROFILER_CALL( - rocprofiler_configure_buffer_tracing_service(get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, - nullptr, - 0, - get_buffers().kernel_trace), - "buffer tracing service for kernel dispatch configure"); - } - - if(tool::get_config().memory_copy_trace) + for(auto&& itr : {buffer_service_config{tool::get_config().kernel_trace, + ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + get_buffers().kernel_trace}, + buffer_service_config{tool::get_config().memory_copy_trace, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + get_buffers().memory_copy_trace}, + buffer_service_config{tool::get_config().scratch_memory_trace, + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, + get_buffers().scratch_memory}, + buffer_service_config{tool::get_config().hsa_core_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_CORE_API, + get_buffers().hsa_api_trace}, + buffer_service_config{tool::get_config().hsa_amd_ext_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API, + get_buffers().hsa_api_trace}, + buffer_service_config{tool::get_config().hsa_image_ext_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API, + get_buffers().hsa_api_trace}, + buffer_service_config{tool::get_config().hsa_finalizer_ext_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API, + get_buffers().hsa_api_trace}, + buffer_service_config{tool::get_config().hip_runtime_api_trace, + ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT, + get_buffers().hip_api_trace}, + buffer_service_config{tool::get_config().hip_compiler_api_trace, + ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT, + get_buffers().hip_api_trace}, + buffer_service_config{tool::get_config().rccl_api_trace, + ROCPROFILER_BUFFER_TRACING_RCCL_API, + get_buffers().rccl_api_trace}, + buffer_service_config{tool::get_config().memory_allocation_trace, + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + get_buffers().memory_allocation_trace}, + buffer_service_config{tool::get_config().rocdecode_api_trace, + ROCPROFILER_BUFFER_TRACING_ROCDECODE_API_EXT, + get_buffers().rocdecode_api_trace}, + buffer_service_config{tool::get_config().rocjpeg_api_trace, + ROCPROFILER_BUFFER_TRACING_ROCJPEG_API, + get_buffers().rocjpeg_api_trace}}) { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - nullptr, - &get_buffers().memory_copy_trace), - "create memory copy buffer"); - - ROCPROFILER_CALL( - rocprofiler_configure_buffer_tracing_service(get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, - nullptr, - 0, - get_buffers().memory_copy_trace), - "buffer tracing service for memory copy configure"); - } - - if(tool::get_config().memory_allocation_trace) - { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - nullptr, - &get_buffers().memory_allocation_trace), - "create memory allocation buffer"); - - ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( - get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, - nullptr, - 0, - get_buffers().memory_allocation_trace), - "buffer tracing service for memory allocation configure"); - } - - if(tool::get_config().scratch_memory_trace) - { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - tool_data, - &get_buffers().scratch_memory), - "buffer creation"); - - ROCPROFILER_CALL( - rocprofiler_configure_buffer_tracing_service(get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, - nullptr, - 0, - get_buffers().scratch_memory), - "buffer tracing service for scratch memory configure"); - } - - if(tool::get_config().hsa_core_api_trace || tool::get_config().hsa_amd_ext_api_trace || - tool::get_config().hsa_image_ext_api_trace || tool::get_config().hsa_finalizer_ext_api_trace) - { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - tool_data, - &get_buffers().hsa_api_trace), - "buffer creation"); - - using optpair_t = std::pair; - for(auto itr : {optpair_t{tool::get_config().hsa_core_api_trace, - ROCPROFILER_BUFFER_TRACING_HSA_CORE_API}, - optpair_t{tool::get_config().hsa_amd_ext_api_trace, - ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API}, - optpair_t{tool::get_config().hsa_image_ext_api_trace, - ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API}, - optpair_t{tool::get_config().hsa_finalizer_ext_api_trace, - ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API}}) + if(itr.option) { - if(itr.first) + // in sdk callback overhead benchmarking, we don't want to use the buffer services + if(tool::get_config().benchmark_mode == tool::config::benchmark::sdk_callback_overhead) + continue; + + if(itr.buffer_id == null_buffer_id) { - ROCPROFILER_CALL( - rocprofiler_configure_buffer_tracing_service( - get_client_ctx(), itr.second, nullptr, 0, get_buffers().hsa_api_trace), - "buffer tracing service for hsa api configure"); + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + buffer_size, + buffer_watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + callbacks.buffered_tracing, + tool_data, + &itr.buffer_id), + "buffer creation"); + + ROCP_FATAL_IF(itr.buffer_id.handle == 0) << "failed to create buffer"; + + auto cb_thread = rocprofiler_callback_thread_t{}; + + ROCP_INFO << "creating dedicated callback thread for buffer " + << itr.buffer_id.handle; + ROCPROFILER_CALL(rocprofiler_create_callback_thread(&cb_thread), + "creating callback thread"); + + ROCP_INFO << "assigning buffer " << itr.buffer_id.handle << " to callback thread " + << cb_thread.handle; + ROCPROFILER_CALL(rocprofiler_assign_callback_thread(itr.buffer_id, cb_thread), + "assigning callback thread"); } + + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + get_client_ctx(), itr.kind, nullptr, 0, itr.buffer_id), + "buffer tracing service configure"); + } + } + + struct callback_service_config + { + bool option = false; + rocprofiler_callback_tracing_kind_t kind = ROCPROFILER_CALLBACK_TRACING_NONE; + rocprofiler_callback_tracing_cb_t callback = nullptr; + }; + + for(auto&& itr : {callback_service_config{tool::get_config().kernel_trace, + ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().memory_copy_trace, + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().scratch_memory_trace, + ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().hsa_core_api_trace, + ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().hsa_amd_ext_api_trace, + ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().hsa_image_ext_api_trace, + ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().hsa_finalizer_ext_api_trace, + ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().hip_runtime_api_trace, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().hip_compiler_api_trace, + ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().rccl_api_trace, + ROCPROFILER_CALLBACK_TRACING_RCCL_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().memory_allocation_trace, + ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().rocdecode_api_trace, + ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API, + dummy_callback_tracing_callback}, + callback_service_config{tool::get_config().rocjpeg_api_trace, + ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API, + dummy_callback_tracing_callback}}) + { + if(itr.option) + { + // in sdk callback overhead benchmarking, we don't want to use the buffer services + if(tool::get_config().benchmark_mode != tool::config::benchmark::sdk_callback_overhead) + continue; + + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + get_client_ctx(), itr.kind, nullptr, 0, itr.callback, nullptr), + "callback tracing service failed to configure"); } } @@ -1748,120 +1940,28 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) id, agent_params.data(), agent_params.size(), - att_dispatch_callback, - att_shader_data_callback, + callbacks.att_dispatch, + callbacks.att_shader_data, tool_data), "thread trace service configure"); } } - if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace) - { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - tool_data, - &get_buffers().hip_api_trace), - "buffer creation"); - - if(tool::get_config().hip_runtime_api_trace) - { - ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( - get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT, - nullptr, - 0, - get_buffers().hip_api_trace), - "buffer tracing service for hip api configure"); - } - - if(tool::get_config().hip_compiler_api_trace) - { - ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( - get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT, - nullptr, - 0, - get_buffers().hip_api_trace), - "buffer tracing service for hip compiler api configure"); - } - } - - if(tool::get_config().rccl_api_trace) - { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - tool_data, - &get_buffers().rccl_api_trace), - "buffer creation"); - - ROCPROFILER_CALL( - rocprofiler_configure_buffer_tracing_service(get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_RCCL_API, - nullptr, - 0, - get_buffers().rccl_api_trace), - "buffer tracing service for rccl api configure"); - } - if(tool::get_config().counter_collection) { ROCPROFILER_CALL(rocprofiler_create_context(&counter_collection_ctx), - "failed to create context"); + "failed to create counter collection context"); ROCPROFILER_CALL( rocprofiler_configure_callback_dispatch_counting_service(counter_collection_ctx, - dispatch_callback, + callbacks.counter_dispatch, nullptr, - counter_record_callback, + callbacks.counter_record, nullptr), "Could not setup counting service"); - ROCPROFILER_CALL(rocprofiler_start_context(counter_collection_ctx), "start context failed"); + + start_context(counter_collection_ctx, "counter collection"); } - if(tool::get_config().rocdecode_api_trace) - { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - tool_data, - &get_buffers().rocdecode_api_trace), - "buffer creation"); - - ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( - get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_ROCDECODE_API_EXT, - nullptr, - 0, - get_buffers().rocdecode_api_trace), - "buffer tracing service for ROCDecode api configure"); - } - - if(tool::get_config().rocjpeg_api_trace) - { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - buffered_tracing_callback, - tool_data, - &get_buffers().rocjpeg_api_trace), - "buffer creation"); - - ROCPROFILER_CALL( - rocprofiler_configure_buffer_tracing_service(get_client_ctx(), - ROCPROFILER_BUFFER_TRACING_ROCJPEG_API, - nullptr, - 0, - get_buffers().rocjpeg_api_trace), - "buffer tracing service for ROCDecode api configure"); - } if(tool::get_config().kernel_rename) { auto rename_ctx = rocprofiler_context_id_t{0}; @@ -1877,34 +1977,38 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, marker_core_api_kinds.data(), marker_core_api_kinds.size(), - kernel_rename_callback, + callbacks.kernel_rename, nullptr), "callback tracing service failed to configure"); - ROCPROFILER_CALL(rocprofiler_start_context(rename_ctx), "start context failed"); + start_context(rename_ctx, "kernel rename"); } + if(!tool::get_config().group_by_queue) { // Track stream ID information via callback service auto hip_stream_display_ctx = rocprofiler_context_id_t{0}; ROCPROFILER_CALL(rocprofiler_create_context(&hip_stream_display_ctx), - "failed to create context"); + "failed to create hip stream context"); ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service(hip_stream_display_ctx, ROCPROFILER_CALLBACK_TRACING_HIP_STREAM, nullptr, 0, - hip_stream_display_callback, + callbacks.hip_stream, nullptr), - "stream tracing configure failed"); - ROCPROFILER_CALL(rocprofiler_start_context(hip_stream_display_ctx), "start context failed"); + "hip stream tracing configure failed"); + + start_context(hip_stream_display_ctx, "hip stream"); // Track if HIP runtime has been initialized via runtime_intialization service auto runtime_initialization_ctx = rocprofiler_context_id_t{0}; + ROCPROFILER_CALL(rocprofiler_create_context(&runtime_initialization_ctx), - "failed to create context"); + "failed to create runtime initialization context"); + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( runtime_initialization_ctx, ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION, @@ -1912,11 +2016,13 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) 0, runtime_initialization_callback, nullptr), - "stream tracing configure failed"); - ROCPROFILER_CALL(rocprofiler_start_context(runtime_initialization_ctx), - "start context failed"); + "runtime initialization tracing configure failed"); + + start_context(runtime_initialization_ctx, "runtime initialization"); } - if(tool::get_config().kernel_rename || !tool::get_config().group_by_queue) + + if((tool::get_config().kernel_rename || !tool::get_config().group_by_queue) && + tool::get_config().benchmark_mode != tool::config::benchmark::execution_profile) { auto external_corr_id_request_kinds = std::array{ @@ -1949,16 +2055,18 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) if(tool::get_config().pc_sampling_host_trap) { - configure_pc_sampling_on_all_agents(buffer_size, buffer_watermark, tool_data); + configure_pc_sampling_on_all_agents( + buffer_size, buffer_watermark, tool_data, callbacks.pc_sampling); } else if(tool::get_config().pc_sampling_stochastic) { - configure_pc_sampling_on_all_agents(buffer_size, buffer_watermark, tool_data); + configure_pc_sampling_on_all_agents( + buffer_size, buffer_watermark, tool_data, callbacks.pc_sampling); } - for(auto itr : get_buffers().as_array()) + for(auto itr : get_buffers().pc_sampling_buffers_as_array()) { - if(itr.handle > 0) + if(itr > null_buffer_id) { auto cb_thread = rocprofiler_callback_thread_t{}; @@ -1973,24 +2081,38 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) } } - if(tool::get_config().collection_periods.empty()) - { - ROCPROFILER_CHECK(rocprofiler_start_context(get_client_ctx())); - } - else - { - auto _prom = std::promise{}; - auto _fut = _prom.get_future(); - std::thread{collection_period_cntrl, std::move(_prom), get_client_ctx()}.detach(); - _fut.wait_for(std::chrono::seconds{1}); // wait for a max of 1 second - } - // Handle kernel id of zero bool include = std::regex_search("0", std::regex(tool::get_config().kernel_filter_include)); bool exclude = std::regex_search("0", std::regex(tool::get_config().kernel_filter_exclude)); if(include && (!exclude || tool::get_config().kernel_filter_exclude.empty())) add_kernel_target(0, tool::get_config().kernel_filter_range); + if(tool::get_config().benchmark_mode == tool::config::benchmark::disabled_contexts_overhead) + { + ROCP_INFO << "rocprofv3 is not recording data because the overhead of inactive contexts is " + "being benchmarked"; + } + else if(tool::get_config().selected_regions) + { + ROCP_WARNING << "rocprofv3 is only recording profiling data within regions of code " + "surrounded by roctxProfilerResume(0)/roctxProfilerPause"; + } + else if(!tool::get_config().collection_periods.empty()) + { + ROCP_INFO << "rocprofv3 will record data during the defined collection period(s)"; + + auto _prom = std::promise{}; + auto _fut = _prom.get_future(); + std::thread{collection_period_cntrl, std::move(_prom), get_client_ctx()}.detach(); + _fut.wait_for(std::chrono::seconds{1}); // wait for a max of 1 second + } + else + { + ROCP_INFO << "rocprofv3 will record data starting now"; + + start_context(get_client_ctx(), "primary rocprofv3"); + } + tool_metadata->process_id = getpid(); rocprofiler_get_timestamp(&(tool_metadata->process_start_ns)); @@ -2008,6 +2130,90 @@ struct output_data uint64_t num_bytes = 0; }; +void +generate_config_output(const tool::config& cfg, const tool::metadata& tool_metadata_v) +{ + using JSONOutputArchive = ::cereal::PrettyJSONOutputArchive; + + constexpr auto json_prec = 16; + constexpr auto json_indent = JSONOutputArchive::Options::IndentChar::space; + auto json_opts = JSONOutputArchive::Options{json_prec, json_indent, 2}; + auto filename = std::string_view{"config"}; + + auto stream = get_output_stream(cfg, filename, ".json"); + { + auto archive = JSONOutputArchive{*stream.stream, json_opts}; + + archive.setNextName("rocprofiler-sdk-tool"); + archive.startNode(); + archive.makeArray(); + archive.startNode(); // first array entry + + auto timestamps = + tool::timestamps_t{tool_metadata_v.process_start_ns, tool_metadata_v.process_end_ns}; + + auto this_pid = tool_metadata_v.process_id; + + archive.setNextName("metadata"); + archive.startNode(); + archive(cereal::make_nvp("pid", this_pid)); + archive(cereal::make_nvp("init_time", timestamps.app_start_time)); + archive(cereal::make_nvp("fini_time", timestamps.app_end_time)); + archive(cereal::make_nvp("config", cfg)); + archive(cereal::make_nvp("command", common::read_command_line(this_pid))); + + { + archive.setNextName("build_spec"); + archive.startNode(); + archive(cereal::make_nvp("version_major", ROCPROFILER_VERSION_MAJOR)); + archive(cereal::make_nvp("version_minor", ROCPROFILER_VERSION_MINOR)); + archive(cereal::make_nvp("version_patch", ROCPROFILER_VERSION_PATCH)); + archive(cereal::make_nvp("soversion", ROCPROFILER_SOVERSION)); + archive(cereal::make_nvp("compiler_id", std::string{ROCPROFILER_COMPILER_ID})); + archive( + cereal::make_nvp("compiler_version", std::string{ROCPROFILER_COMPILER_VERSION})); + archive(cereal::make_nvp("git_describe", std::string{ROCPROFILER_GIT_DESCRIBE})); + archive(cereal::make_nvp("git_revision", std::string{ROCPROFILER_GIT_REVISION})); + archive(cereal::make_nvp("library_arch", std::string{ROCPROFILER_LIBRARY_ARCH})); + archive(cereal::make_nvp("system_name", std::string{ROCPROFILER_SYSTEM_NAME})); + archive( + cereal::make_nvp("system_processor", std::string{ROCPROFILER_SYSTEM_PROCESSOR})); + archive(cereal::make_nvp("system_version", std::string{ROCPROFILER_SYSTEM_VERSION})); + archive.finishNode(); // build_spec + } + + // save the execution profile + if(execution_profile) archive(cereal::make_nvp("profile", execution_profile->get())); + + // save the environment variables + { + archive.setNextName("environment"); + archive.startNode(); + size_t idx = 0; + while(true) + { + const auto* env_entry = environ[idx++]; + if(!env_entry) + break; + else if(std::string_view{env_entry}.find('=') != std::string_view::npos) + { + auto _entry = std::string{env_entry}; + auto _pos = _entry.find('='); + auto _name = _entry.substr(0, _pos); + auto _value = _entry.substr(_pos + 1); + archive(cereal::make_nvp(_name.c_str(), _value)); + } + } + archive.finishNode(); + } + + archive.finishNode(); // metadata + archive.finishNode(); // first array entry + archive.finishNode(); // rocprofiler-sdk-tool + } + stream.close(); +} + template void generate_output(tool::buffered_output& output_v, @@ -2019,6 +2225,9 @@ generate_output(tool::buffered_output& output_v, if(!output_v) return; + // when benchmarking, we do not generate output + if(tool::get_config().benchmark_mode != tool::config::benchmark::none) return; + // opens temporary file and sets read position to beginning output_v.read(); @@ -2104,6 +2313,12 @@ tool_fini(void* /*tool_data*/) cleanups.clear(); }; + // generate the configuration output regardless of whether there is any data + if(tool::get_config().output_config_file) + { + generate_config_output(tool::get_config(), *tool_metadata); + } + auto _dtor = common::scope_destructor{run_cleanup}; generate_output(kernel_dispatch_output, outdata, contributions, cleanups); @@ -2662,6 +2877,7 @@ rocprofiler_configure(uint32_t version, // ensure these pointers are not leaked add_destructor(tool_metadata); + add_destructor(execution_profile); // in case main wrapper is not used ::atexit([]() { finalize_rocprofv3("atexit"); }); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.cpp index d7461acdf7..46af922fce 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.cpp @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include - +#include "lib/rocprofiler-sdk/external_correlation.hpp" #include "lib/common/synchronized.hpp" #include "lib/common/utility.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" -#include "lib/rocprofiler-sdk/external_correlation.hpp" + +#include +#include #include @@ -36,6 +36,50 @@ namespace external_correlation { namespace { +#define ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(CODE) \ + template <> \ + struct external_correlation_id_request_kind_string< \ + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_##CODE> \ + { \ + static constexpr auto value = \ + std::pair{#CODE, std::string_view{#CODE}.length()}; \ + }; + +template +struct external_correlation_id_request_kind_string; + +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(NONE) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_CORE_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_AMD_EXT_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_IMAGE_EXT_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_FINALIZE_EXT_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HIP_RUNTIME_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HIP_COMPILER_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MARKER_CORE_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MARKER_CONTROL_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MARKER_NAME_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MEMORY_COPY) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MEMORY_ALLOCATION) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(KERNEL_DISPATCH) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(SCRATCH_MEMORY) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(RCCL_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(OMPT) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(ROCDECODE_API) +ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(ROCJPEG_API) + +#undef ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING + +template +std::pair +get_kind_name(rocprofiler_external_correlation_id_request_kind_t kind, + std::index_sequence) +{ + if(kind == Idx) return external_correlation_id_request_kind_string::value; + // recursion until tail empty + if constexpr(sizeof...(Tail) > 0) return get_kind_name(kind, std::index_sequence{}); + return {nullptr, 0}; +} + auto get_default_tid() { @@ -249,6 +293,21 @@ rocprofiler_configure_external_correlation_id_request_service( callback, callback_args, kinds_v); } +rocprofiler_status_t +rocprofiler_query_external_correlation_id_request_kind_name( + rocprofiler_external_correlation_id_request_kind_t kind, + const char** name, + uint64_t* name_len) +{ + auto&& val = rocprofiler::external_correlation::get_kind_name( + kind, std::make_index_sequence{}); + + if(name) *name = val.first; + if(name_len) *name_len = val.second; + + return (val.first) ? ROCPROFILER_STATUS_SUCCESS : ROCPROFILER_STATUS_ERROR_KIND_NOT_FOUND; +} + rocprofiler_status_t rocprofiler_push_external_correlation_id(rocprofiler_context_id_t context, rocprofiler_thread_id_t tid, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.hpp index 9e829b7c24..9b32732b3b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/external_correlation.hpp @@ -42,6 +42,7 @@ namespace context { struct context; } + namespace external_correlation { static constexpr bool enable_const_wlock_v = true; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index 1a10b55a8e..cb2b9dbdad 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -688,6 +688,33 @@ initialize() if(get_num_clients() > 0) internal_threading::initialize(); // initialization is no longer available set_init_status(1); + + if(get_num_clients() > 0) + { + for(const auto& itr : *get_clients()) + { + if(!itr) continue; + size_t _client_registered_ctx = 0; + for(const auto* citr : context::get_registered_contexts()) + { + if(citr->client_idx == itr->internal_client_id.handle) ++_client_registered_ctx; + } + + size_t _client_activated_ctx = 0; + for(const auto* citr : context::get_active_contexts()) + { + if(citr->client_idx == itr->internal_client_id.handle) ++_client_activated_ctx; + } + + ROCP_INFO << fmt::format("rocprofiler-sdk client '{}' registered {} context(s) and " + "started {} context(s)", + (itr->mutable_client_id.name) + ? std::string_view{itr->mutable_client_id.name} + : std::string_view{"unspecified"}, + _client_registered_ctx, + _client_activated_ctx); + } + } }); } diff --git a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt index f8ddfeac0d..1acd978738 100644 --- a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt @@ -15,6 +15,7 @@ find_package(rocJPEG) # applications used by integration tests which DO link to rocprofiler-sdk-roctx add_subdirectory(reproducible-runtime) +add_subdirectory(reproducible-dispatch-count) add_subdirectory(transpose) add_subdirectory(openmp) diff --git a/projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/CMakeLists.txt new file mode 100644 index 0000000000..df10dd7e3f --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/CMakeLists.txt @@ -0,0 +1,59 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-tests-bin-reproducible-dispatch-count LANGUAGES CXX HIP) + +if(NOT CMAKE_BUILD_TYPE MATCHES "(Release|RelWithDebInfo)") + set(CMAKE_BUILD_TYPE "RelWithDebInfo") +endif() + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +option(REPRODUCIBLE_DISPATCH_COUNT_USE_MPI + "Enable MPI support in reproducible-dispatch-count exe" OFF) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(reproducible-dispatch-count.cpp PROPERTIES LANGUAGE HIP) +add_executable(reproducible-dispatch-count) +target_sources(reproducible-dispatch-count PRIVATE reproducible-dispatch-count.cpp) +target_compile_options(reproducible-dispatch-count PRIVATE -W -Wall -Wextra -Wpedantic + -Wshadow -Werror) + +find_package(Threads REQUIRED) +target_link_libraries(reproducible-dispatch-count PRIVATE Threads::Threads) + +find_package(rocprofiler-sdk-roctx REQUIRED) +target_link_libraries(reproducible-dispatch-count + PRIVATE rocprofiler-sdk-roctx::rocprofiler-sdk-roctx) + +if(REPRODUCIBLE_DISPATCH_COUNT_USE_MPI) + find_package(MPI REQUIRED) + target_compile_definitions(reproducible-dispatch-count PRIVATE USE_MPI) + target_link_libraries(reproducible-dispatch-count PRIVATE MPI::MPI_C) +endif() diff --git a/projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/reproducible-dispatch-count.cpp b/projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/reproducible-dispatch-count.cpp new file mode 100644 index 0000000000..55ea5855cf --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/reproducible-dispatch-count/reproducible-dispatch-count.cpp @@ -0,0 +1,254 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "hip/hip_runtime.h" +#include "rocprofiler-sdk-roctx/roctx.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(USE_MPI) +# include +#endif + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + auto _hip_api_print_lk = auto_lock_t{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error %i : %s\n", \ + __FILE__, \ + __LINE__, \ + static_cast(error_), \ + hipGetErrorString(error_)); \ + throw std::runtime_error("hip_api_call"); \ + } \ + } + +namespace +{ +using auto_lock_t = std::unique_lock; +auto print_lock = std::mutex{}; +size_t niterations = 1000; +uint32_t nspin = 4 * 10000; +size_t nsync = 1; +size_t nthreads = 2; + +void +check_hip_error(void); +} // namespace + +__global__ void +reproducible_dispatch_count(uint32_t nspin); + +void +run(int tid, int devid); + +void +run_nsync(int tid, int devid); + +int +main(int argc, char** argv) +{ + for(int i = 1; i < argc; ++i) + { + auto _arg = std::string{argv[i]}; + if(_arg == "?" || _arg == "-h" || _arg == "--help") + { + fprintf(stderr, + "usage: reproducible-dispatch-count [KERNEL ITERATIONS PER THREAD (default: " + "%zu msec)] [NUM_THREADS (default: %zu)] [SPIN CYCLES PER KERNEL LAUNCH " + "(default: %u)] [ITERATION PER SYNC (default: %zu)\n", + niterations, + nthreads, + nspin, + nsync); + exit(EXIT_SUCCESS); + } + } + + if(argc > 1) niterations = std::stoll(argv[1]); + if(argc > 2) nthreads = std::stoll(argv[2]); + if(argc > 3) nspin = std::stoll(argv[3]); + if(argc > 4) nsync = std::stoll(argv[4]); + + printf("[reproducible-dispatch-count] Kernel dispatches per thread: %zu\n", niterations); + printf("[reproducible-dispatch-count] Spin time per kernel: %u cycles\n", nspin); + printf("[reproducible-dispatch-count] Number of threads: %zu\n", nthreads); + printf("[reproducible-dispatch-count] Iterations per sync: %zu\n", nsync); + + // this is a temporary workaround in omnitrace when HIP + MPI is enabled + int ndevice = 0; + HIP_API_CALL(hipGetDeviceCount(&ndevice)); + printf("[reproducible-dispatch-count] Number of devices found: %i\n", ndevice); + auto _threads = std::vector{}; + for(size_t i = 0; i < nthreads; ++i) + { + if(nsync <= 1) + _threads.emplace_back(run, i, i % ndevice); + else + _threads.emplace_back(run_nsync, i, i % ndevice); + } + for(auto& itr : _threads) + itr.join(); + HIP_API_CALL(hipDeviceSynchronize()); + HIP_API_CALL(hipDeviceReset()); + + return 0; +} + +__global__ void +reproducible_dispatch_count(uint32_t nspin_v) +{ + for(uint32_t i = 0; i < nspin_v / 64; i++) + asm volatile("s_sleep 1"); + if(nspin_v > 64) + for(uint32_t i = 0; i < nspin_v % 64; i++) + asm volatile("s_sleep 1"); +} + +void +run(int tid, int devid) +{ + auto roctx_range_id = roctxRangeStart("run"); + + constexpr int min_avail_simd = 128; + dim3 grid(min_avail_simd); + dim3 block(32); + double time = 0.0; + hipStream_t stream = {}; + hipEvent_t start = {}; + hipEvent_t stop = {}; + uint64_t nlaunch = 0; + + HIP_API_CALL(hipSetDevice(devid)); + HIP_API_CALL(hipStreamCreate(&stream)); + HIP_API_CALL(hipEventCreate(&start)); + HIP_API_CALL(hipEventCreate(&stop)); + + for(size_t i = 0; i < niterations; ++i) + { + roctxMark("iteration"); + HIP_API_CALL(hipEventRecord(start, stream)); + reproducible_dispatch_count<<>>(nspin); + HIP_API_CALL(hipEventRecord(stop, stream)); + check_hip_error(); + HIP_API_CALL(hipEventSynchronize(stop)); + float elapsed = 0.0f; + HIP_API_CALL(hipEventElapsedTime(&elapsed, start, stop)); + time += static_cast(elapsed); + ++nlaunch; + } + + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipEventDestroy(start)); + HIP_API_CALL(hipEventDestroy(stop)); + + { + auto _msg = std::stringstream{}; + _msg << '[' << getpid() << "][" << tid << "] Runtime of reproducible-dispatch-count is " + << std::setprecision(2) << std::fixed << time << " ms (" << std::setprecision(3) + << (time / 1000.0f) << " sec). Kernels dispatched: " << nlaunch << "\n"; + auto_lock_t _lk{print_lock}; + std::cout << _msg.str() << std::flush; + } + + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipStreamDestroy(stream)); + + roctxRangeStop(roctx_range_id); +} + +void +run_nsync(int tid, int devid) +{ + auto roctx_range_id = roctxRangeStart("run"); + + constexpr int min_avail_simd = 128; + dim3 grid(min_avail_simd); + dim3 block(32); + hipStream_t stream = {}; + uint64_t nlaunch = 0; + + HIP_API_CALL(hipSetDevice(devid)); + HIP_API_CALL(hipStreamCreate(&stream)); + + auto _elapsed = std::chrono::steady_clock::duration{}; + auto _beg = std::chrono::steady_clock::now(); + for(size_t i = 0; i < niterations; ++i) + { + roctxMark("iteration"); + reproducible_dispatch_count<<>>(nspin); + if((i % nsync) == (nsync - 1)) + { + HIP_API_CALL(hipStreamSynchronize(stream)); + auto _end = std::chrono::steady_clock::now(); + _elapsed += (_end - _beg); + _beg = std::chrono::steady_clock::now(); + } + ++nlaunch; + } + + HIP_API_CALL(hipStreamSynchronize(stream)); + auto _end = std::chrono::steady_clock::now(); + _elapsed += (_end - _beg); + + { + auto _time = + std::chrono::duration_cast>(_elapsed).count(); + auto _msg = std::stringstream{}; + _msg << '[' << getpid() << "][" << tid << "] Runtime of reproducible-dispatch-count is " + << std::setprecision(2) << std::fixed << _time << " ms (" << std::setprecision(3) + << (_time / 1000.0f) << " sec). Kernels dispatched: " << nlaunch << "\n"; + auto_lock_t _lk{print_lock}; + std::cout << _msg.str() << std::flush; + } + + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipStreamDestroy(stream)); + + roctxRangeStop(roctx_range_id); +} + +namespace +{ +void +check_hip_error(void) +{ + hipError_t err = hipGetLastError(); + if(err != hipSuccess) + { + auto_lock_t _lk{print_lock}; + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; + throw std::runtime_error("hip_api_call"); + } +} +} // namespace