diff --git a/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml b/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml index 8ed04a7ef0..9bd0c8ed4d 100644 --- a/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml +++ b/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml @@ -23,8 +23,14 @@ env: ROCM_PATH: "/opt/rocm" GPU_TARGETS: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102" PATH: "/usr/bin:$PATH" - TEMP_EXCLUDED_TESTS: "test-page-migration|app-abort" - EXCLUDED_TESTS: ".*pc-sampling.*|.*pc_sampling.*" + navi3_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" + vega20_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" + mi200_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" + mi300_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" + navi3_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$" + vega20_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$" + mi200_EXCLUDE_LABEL_REGEX: "^(openmp-target)$" + mi300_EXCLUDE_LABEL_REGEX: "^(pc-sampling)$" jobs: core-deb: @@ -32,12 +38,12 @@ jobs: strategy: fail-fast: false matrix: - runner: ['vega20-emu', 'mi300-emu'] + runner: ['vega20', 'mi300'] os: ['ubuntu-22.04'] build-type: ['RelWithDebInfo'] ci-flags: ['--linter clang-tidy'] - runs-on: ${{ matrix.runner }}-runner-set + runs-on: ${{ matrix.runner }}-emu-runner-set # define this for containers env: @@ -72,7 +78,6 @@ jobs: if: ${{ contains(matrix.runner, 'mi200') }} shell: bash run: | - echo "EXCLUDED_TESTS=''" >> $GITHUB_ENV echo 'ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1' >> $GITHUB_ENV - name: Configure, Build, and Test @@ -94,8 +99,8 @@ jobs: -DCPACK_PACKAGING_INSTALL_PREFIX="$(realpath /opt/rocm)" -DPython3_EXECUTABLE=$(which python3) -- - -LE "${EXCLUDED_TESTS}" - -E "${{ env.TEMP_EXCLUDED_TESTS }}" + -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" + -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" - name: Install if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} @@ -119,8 +124,8 @@ jobs: export LD_LIBRARY_PATH=/opt/rocprofiler-sdk/lib:${LD_LIBRARY_PATH} cmake --build build-samples --target all --parallel 16 cmake --build build-tests --target all --parallel 16 - ctest --test-dir build-samples -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure - ctest --test-dir build-tests -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure + ctest --test-dir build-samples -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure + ctest --test-dir build-tests -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure - name: Install Packages if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} @@ -142,8 +147,8 @@ jobs: CMAKE_PREFIX_PATH=/opt/rocm cmake -B build-tests-deb /opt/rocm/share/rocprofiler-sdk/tests cmake --build build-samples-deb --target all --parallel 16 cmake --build build-tests-deb --target all --parallel 16 - ctest --test-dir build-samples-deb -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure - ctest --test-dir build-tests-deb -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure + ctest --test-dir build-samples-deb -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure + ctest --test-dir build-tests-deb -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure - name: Archive production artifacts if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} @@ -160,7 +165,7 @@ jobs: strategy: fail-fast: false matrix: - runner: ['mi300-emu'] + runner: ['mi300'] os: ['rhel-emu', 'sles-emu'] build-type: ['RelWithDebInfo'] ci-flags: ['--linter clang-tidy'] @@ -196,7 +201,6 @@ jobs: if: ${{ contains(matrix.runner, 'mi200') }} shell: bash run: | - echo "EXCLUDED_TESTS=''" >> $GITHUB_ENV echo 'ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1' >> $GITHUB_ENV - name: Configure, Build, and Test @@ -218,8 +222,8 @@ jobs: -DCPACK_PACKAGING_INSTALL_PREFIX="$(realpath /opt/rocm)" -DPython3_EXECUTABLE=$(which python3) -- - -LE "${EXCLUDED_TESTS}" - -E "${{ env.TEMP_EXCLUDED_TESTS }}" + -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" + -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" - name: Install if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} @@ -243,8 +247,8 @@ jobs: export LD_LIBRARY_PATH=/opt/rocprofiler-sdk/lib:${LD_LIBRARY_PATH} cmake --build build-samples --target all --parallel 16 cmake --build build-tests --target all --parallel 16 - ctest --test-dir build-samples -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure - ctest --test-dir build-tests -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure + ctest --test-dir build-samples -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure + ctest --test-dir build-tests -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure - name: Install Packages if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} @@ -266,8 +270,8 @@ jobs: CMAKE_PREFIX_PATH=/opt/rocm cmake -B build-tests-deb /opt/rocm/share/rocprofiler-sdk/tests cmake --build build-samples-deb --target all --parallel 16 cmake --build build-tests-deb --target all --parallel 16 - ctest --test-dir build-samples-deb -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure - ctest --test-dir build-tests-deb -LE "${EXCLUDED_TESTS}" -E "${{ env.TEMP_EXCLUDED_TESTS }}" --output-on-failure + ctest --test-dir build-samples-deb -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure + ctest --test-dir build-tests-deb -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" --output-on-failure - name: Archive production artifacts if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} @@ -284,11 +288,11 @@ jobs: strategy: # fail-fast: false matrix: - runner: ['mi200-emu'] + runner: ['mi200'] os: ['ubuntu-22.04'] build-type: ['Release'] - runs-on: ${{ matrix.runner }}-runner-set + runs-on: ${{ matrix.runner }}-emu-runner-set # define this for containers env: @@ -361,7 +365,6 @@ jobs: if: ${{ contains(matrix.runner, 'mi200') }} shell: bash run: | - echo "EXCLUDED_TESTS=''" >> $GITHUB_ENV echo 'ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1' >> $GITHUB_ENV - name: Configure, Build, and Test (Total Code Coverage) @@ -379,8 +382,8 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -DPython3_EXECUTABLE=$(which python3) -- - -LE "${EXCLUDED_TESTS}" - -E "${{ env.TEMP_EXCLUDED_TESTS }}" + -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" + -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" - name: Configure, Build, and Test (Tests Code Coverage) timeout-minutes: 30 @@ -398,8 +401,8 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -DPython3_EXECUTABLE=$(which python3) -- - -LE "${EXCLUDED_TESTS}" - -E "${{ env.TEMP_EXCLUDED_TESTS }}" + -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" + -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" - name: Configure, Build, and Test (Samples Code Coverage) timeout-minutes: 30 @@ -417,8 +420,8 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -DPython3_EXECUTABLE=$(which python3) -- - -LE "${EXCLUDED_TESTS}" - -E "${{ env.TEMP_EXCLUDED_TESTS }}" + -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" + -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" - name: Save XML Code Coverage id: save-coverage @@ -536,7 +539,7 @@ jobs: # - unittests # - integration-tests # - ctest -N -LE 'samples|tests' -E "${{ env.TEMP_EXCLUDED_TESTS }}" -O ctest.mislabeled.log + ctest -N -LE 'samples|tests' -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" -O ctest.mislabeled.log grep 'Total Tests: 0' ctest.mislabeled.log # # if following fails, then there is overlap between the labels. @@ -561,22 +564,22 @@ jobs: strategy: fail-fast: false matrix: - runner: ['vega20-emu', 'navi3-emu', 'mi300-emu'] + runner: ['vega20', 'navi3', 'mi300'] sanitizer: ['AddressSanitizer', 'ThreadSanitizer', 'LeakSanitizer', 'UndefinedBehaviorSanitizer'] os: ['ubuntu-22.04'] build-type: ['RelWithDebInfo'] exclude: - - { runner: 'vega20-emu', sanitizer: 'ThreadSanitizer' } - - { runner: 'vega20-emu', sanitizer: 'AddressSanitizer' } - - { runner: 'vega20-emu', sanitizer: 'UndefinedBehaviorSanitizer' } - - { runner: 'mi300-emu', sanitizer: 'AddressSanitizer' } - - { runner: 'mi300-emu', sanitizer: 'LeakSanitizer' } - - { runner: 'navi3-emu', sanitizer: 'LeakSanitizer' } - - { runner: 'navi3-emu', sanitizer: 'ThreadSanitizer' } - - { runner: 'navi3-emu', sanitizer: 'UndefinedBehaviorSanitizer' } + - { runner: 'vega20', sanitizer: 'ThreadSanitizer' } + - { runner: 'vega20', sanitizer: 'AddressSanitizer' } + - { runner: 'vega20', sanitizer: 'UndefinedBehaviorSanitizer' } + - { runner: 'mi300', sanitizer: 'AddressSanitizer' } + - { runner: 'mi300', sanitizer: 'LeakSanitizer' } + - { runner: 'navi3', sanitizer: 'LeakSanitizer' } + - { runner: 'navi3', sanitizer: 'ThreadSanitizer' } + - { runner: 'navi3', sanitizer: 'UndefinedBehaviorSanitizer' } if: ${{ contains(github.event_name, 'pull_request') }} - runs-on: ${{ matrix.runner }}-runner-set + runs-on: ${{ matrix.runner }}-emu-runner-set # define this for containers env: @@ -611,7 +614,6 @@ jobs: if: ${{ contains(matrix.runner, 'mi200') }} shell: bash run: | - echo "EXCLUDED_TESTS=''" >> $GITHUB_ENV echo 'ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1' >> $GITHUB_ENV - name: Configure, Build, and Test @@ -630,5 +632,5 @@ jobs: -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" -DPython3_EXECUTABLE=$(which python3) -- - -LE "${EXCLUDED_TESTS}" - -E "${{ env.TEMP_EXCLUDED_TESTS }}" + -LE "${${{ matrix.runner }}_EXCLUDE_LABEL_REGEX}" + -E "${${{ matrix.runner }}_EXCLUDE_TESTS_REGEX}" diff --git a/projects/rocprofiler-sdk/cmake/rocprofiler_config_install_roctx.cmake b/projects/rocprofiler-sdk/cmake/rocprofiler_config_install_roctx.cmake index 62d38c8860..797093df6d 100644 --- a/projects/rocprofiler-sdk/cmake/rocprofiler_config_install_roctx.cmake +++ b/projects/rocprofiler-sdk/cmake/rocprofiler_config_install_roctx.cmake @@ -60,7 +60,6 @@ set(${PACKAGE_NAME}_BUILD_TREE set(PROJECT_BUILD_TREE_TARGETS ${SDK_PACKAGE_NAME}::${PACKAGE_NAME}-shared-library ${SDK_PACKAGE_NAME}::${SDK_PACKAGE_NAME}-headers - ${SDK_PACKAGE_NAME}::${SDK_PACKAGE_NAME}-build-flags ${SDK_PACKAGE_NAME}::${SDK_PACKAGE_NAME}-stack-protector) configure_file( diff --git a/projects/rocprofiler-sdk/samples/CMakeLists.txt b/projects/rocprofiler-sdk/samples/CMakeLists.txt index 32ec96b35c..cc61579b8c 100644 --- a/projects/rocprofiler-sdk/samples/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/CMakeLists.txt @@ -33,3 +33,4 @@ add_subdirectory(code_object_isa_decode) add_subdirectory(advanced_thread_trace) add_subdirectory(external_correlation_id_request) add_subdirectory(pc_sampling) +add_subdirectory(openmp_target) diff --git a/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp b/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp index f48e031520..4c367dfa88 100644 --- a/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp +++ b/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp @@ -189,7 +189,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); - callback_name_info name_info = common::get_callback_id_names(); + callback_name_info name_info = common::get_callback_tracing_names(); for(const auto& itr : name_info) { diff --git a/projects/rocprofiler-sdk/samples/common/name_info.hpp b/projects/rocprofiler-sdk/samples/common/name_info.hpp index 1964bd2624..35793a946e 100644 --- a/projects/rocprofiler-sdk/samples/common/name_info.hpp +++ b/projects/rocprofiler-sdk/samples/common/name_info.hpp @@ -48,7 +48,7 @@ get_buffer_tracing_names() } inline auto -get_callback_id_names() +get_callback_tracing_names() { return rocprofiler::sdk::get_callback_tracing_names(); } diff --git a/projects/rocprofiler-sdk/samples/openmp_target/CMakeLists.txt b/projects/rocprofiler-sdk/samples/openmp_target/CMakeLists.txt new file mode 100644 index 0000000000..ba91806c2b --- /dev/null +++ b/projects/rocprofiler-sdk/samples/openmp_target/CMakeLists.txt @@ -0,0 +1,95 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT OMP_TARGET_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(OMP_TARGET_COMPILER + "${amdclangpp_EXECUTABLE}" + CACHE FILEPATH "") + endif() +endif() + +project(rocprofiler-sdk-samples-openmp-target LANGUAGES CXX) + +find_package(rocprofiler-sdk REQUIRED) + +add_library(openmp-target-sample-client SHARED) +target_sources(openmp-target-sample-client PRIVATE client.cpp client.hpp) +target_link_libraries( + openmp-target-sample-client + PRIVATE rocprofiler-sdk::rocprofiler-sdk rocprofiler-sdk::samples-build-flags + rocprofiler-sdk::samples-common-library) + +set(DEFAULT_GPU_TARGETS + "gfx906" + "gfx908" + "gfx90a" + "gfx940" + "gfx941" + "gfx942" + "gfx1100" + "gfx1101" + "gfx1102") + +set(OPENMP_GPU_TARGETS + "${DEFAULT_GPU_TARGETS}" + CACHE STRING "GPU targets to compile for") + +if(ROCPROFILER_MEMCHECK STREQUAL "ThreadSanitizer") + set(IS_THREAD_SANITIZER ON) +else() + set(IS_THREAD_SANITIZER OFF) +endif() + +find_package(Threads REQUIRED) +find_package(rocprofiler-sdk-roctx REQUIRED) + +add_executable(openmp-target-sample) +target_sources(openmp-target-sample PRIVATE main.cpp) +target_link_libraries( + openmp-target-sample PRIVATE Threads::Threads + rocprofiler-sdk-roctx::rocprofiler-sdk-roctx) +target_compile_options(openmp-target-sample PRIVATE -fopenmp) +target_link_options(openmp-target-sample PRIVATE -fopenmp) + +foreach(_TARGET ${OPENMP_GPU_TARGETS}) + target_compile_options(openmp-target-sample PRIVATE --offload-arch=${_TARGET}) + target_link_options(openmp-target-sample PRIVATE --offload-arch=${_TARGET}) +endforeach() + +include(rocprofiler-sdk-custom-compilation) +rocprofiler_sdk_custom_compilation(TARGET openmp-target-sample + COMPILER ${OMP_TARGET_COMPILER}) + +rocprofiler_samples_get_preload_env(PRELOAD_ENV openmp-target-sample-client) +rocprofiler_samples_get_ld_library_path_env( + LIBRARY_PATH_ENV rocprofiler-sdk-roctx::rocprofiler-sdk-roctx-shared-library) + +set(openmp-target-sample-env + ${PRELOAD_ENV} ${LIBRARY_PATH_ENV} "OMP_NUM_THREADS=2" "OMP_TARGET_OFFLOAD=mandatory" + "OMP_DISPLAY_ENV=1" "ROCR_VISIBLE_DEVICES=0") + +add_test(NAME openmp-target-sample COMMAND $) + +set_tests_properties( + openmp-target-sample + PROPERTIES TIMEOUT + 45 + LABELS + "samples;openmp-target" + ENVIRONMENT + "${openmp-target-sample-env}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED + "${IS_THREAD_SANITIZER}") diff --git a/projects/rocprofiler-sdk/samples/openmp_target/client.cpp b/projects/rocprofiler-sdk/samples/openmp_target/client.cpp new file mode 100644 index 0000000000..1242f1b358 --- /dev/null +++ b/projects/rocprofiler-sdk/samples/openmp_target/client.cpp @@ -0,0 +1,660 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +/** + * @file samples/ompt/client.cpp + * + * @brief Example rocprofiler client (tool) + */ + +#include "client.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "common/call_stack.hpp" +#include "common/defines.hpp" +#include "common/filesystem.hpp" +#include "common/name_info.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace client +{ +namespace +{ +using common::call_stack_t; +using common::callback_name_info; +using common::source_location; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using kernel_symbol_map_t = std::unordered_map; + +rocprofiler_client_id_t* client_id = nullptr; +rocprofiler_client_finalize_t client_fini_func = nullptr; +auto client_ctx = rocprofiler_context_id_t{}; +auto cb_name_info = common::get_callback_tracing_names(); +auto bf_name_info = common::get_buffer_tracing_names(); +auto client_buffer = rocprofiler_buffer_id_t{}; +auto client_kernels = kernel_symbol_map_t{}; +auto call_stack_mtx = std::mutex{}; + +auto +get_call_stack_lock() +{ + return std::unique_lock{call_stack_mtx}; +} + +void +print_call_stack(const call_stack_t& _call_stack) +{ + common::print_call_stack("openmp_target_trace.log", _call_stack); +} + +void +tool_tracing_ctrl_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t*, + void* client_data) +{ + auto* ctx = static_cast(client_data); + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER && + record.kind == ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API && + record.operation == ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerPause) + { + ROCPROFILER_CALL(rocprofiler_stop_context(*ctx), "pausing client context"); + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT && + record.kind == ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API && + record.operation == ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerResume) + { + ROCPROFILER_CALL(rocprofiler_start_context(*ctx), "resuming client context"); + } +} + +void +tool_callback_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) +{ + assert(callback_data != nullptr); + + if(record.kind == ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API) + { + if(record.operation == ROCPROFILER_HSA_CORE_API_ID_hsa_queue_destroy) + { + // skip hsa_queue_destroy for now, it tries to print the queue after it is destroyed + return; + } + } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_LOAD) + { + if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + // flush the buffer to ensure that any lookups for the client kernel names for the code + // object are completed + auto flush_status = rocprofiler_flush_buffer(client_buffer); + if(flush_status != ROCPROFILER_STATUS_ERROR_BUFFER_BUSY) + ROCPROFILER_CALL(flush_status, "buffer flush"); + } + } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + auto* data = static_cast(record.payload); + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + client_kernels.emplace(data->kernel_id, *data); + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + client_kernels.erase(data->kernel_id); + } + } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_OMPT) + { + // demonstrate the use of the ompt_data_t* fields from OMPT + // The client has its own version of those fields as well as an interface to the + // ompt API entry points. + auto* data = static_cast(record.payload); + + if(record.operation == ROCPROFILER_OMPT_ID_parallel_begin) + { + // set the parallel_data value + auto& args = data->args.parallel_begin; + args.parallel_data->value = record.correlation_id.internal; + } + else if(record.operation == ROCPROFILER_OMPT_ID_parallel_end) + { + // set the parallel_data value + auto& args = data->args.parallel_end; + args.parallel_data->value = 0; + } + else if(record.operation == ROCPROFILER_OMPT_ID_thread_begin) + { + // set the thread_data value + auto& args = data->args.thread_begin; + args.thread_data->value = record.thread_id; + } + else if(record.operation == ROCPROFILER_OMPT_ID_thread_end) + { + // set the thread_data value + auto& args = data->args.thread_end; + args.thread_data->value = 0; + } + else if(record.operation == ROCPROFILER_OMPT_ID_implicit_task) + { + auto& args = data->args.implicit_task; + // set the task_data value + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + args.task_data->value = record.correlation_id.internal; + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + args.task_data->value = 0; + else + assert(0); + } + else if(record.operation == ROCPROFILER_OMPT_ID_target_emi) + { + auto& args = data->args.target_emi; + // set the target_data value + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + args.target_data->value = record.correlation_id.internal; + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + args.target_data->value = 0; + else + assert(0); + } + else if(record.operation == ROCPROFILER_OMPT_ID_target_data_op_emi) + { + auto& args = data->args.target_data_op_emi; + // set the host_op_id value + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + args.host_op_id->value = record.correlation_id.internal; + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + args.host_op_id->value = 0; + else + assert(0); + } + else if(record.operation == ROCPROFILER_OMPT_ID_target_submit_emi) + { + // set the host_op_id value + auto& args = data->args.target_submit_emi; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + args.host_op_id->value = record.correlation_id.internal; + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + args.host_op_id->value = 0; + else + assert(0); + } + } + + auto now = std::chrono::steady_clock::now().time_since_epoch().count(); + uint64_t dt = 0; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + user_data->value = now; + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + dt = (now - user_data->value); + + const char* name = nullptr; + rocprofiler_query_callback_tracing_kind_operation_name( + record.kind, record.operation, &name, nullptr); + + auto info = std::stringstream{}; + info << std::left << "tid=" << record.thread_id << ", cid=" << std::setw(3) + << record.correlation_id.internal << ", kind=" << std::setw(2) << record.kind + << ", operation=" << std::setw(3) << record.operation << ", phase=" << record.phase + << ", dt_nsec=" << std::setw(8) << dt << ", name=" << name; + + auto info_data_cb = [](rocprofiler_callback_tracing_kind_t, + int, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t indirection_count, + const char* arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t dereference_count, + void* cb_data) -> int { + auto& dss = *static_cast(cb_data); + dss << ((arg_num == 0) ? "(" : ", "); + dss << arg_num << ": " << arg_name << "=" << arg_value_str; + (void) arg_value_addr; + (void) arg_type; + (void) indirection_count; + (void) dereference_count; + return 0; + }; + + int32_t max_deref = 1; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + // not for PHASE_NONE + max_deref = 2; + auto info_data = std::stringstream{}; + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) + { + ROCPROFILER_CALL(rocprofiler_iterate_callback_tracing_kind_operation_args( + record, info_data_cb, max_deref, static_cast(&info_data)), + "Failure iterating trace operation args"); + } + + auto info_data_str = info_data.str(); + if(!info_data_str.empty()) info << " " << info_data_str << ")"; + + auto* call_stack_v = static_cast(callback_data); + auto _lk = get_call_stack_lock(); + call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); +} + +void +tool_buffered_tracing_callback(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t drop_count) +{ + assert(drop_count == 0 && "drop count should be zero for lossless policy"); + + auto* call_stack_v = static_cast(user_data); + + for(size_t i = 0; i < num_headers; ++i) + { + auto* header = headers[i]; + + if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) + { + auto* record = + static_cast(header->payload); + + auto info = std::stringstream{}; + + auto dt = (record->end_timestamp - record->start_timestamp); + info << std::left << "tid=" << record->thread_id << ", cid=" << std::setw(3) + << record->correlation_id.internal << ", kind=" << std::setw(2) << record->kind + << ", operation=" << std::setw(3) << record->operation << ", phase= " + << ", dt_nsec=" << std::setw(8) << dt + << ", agent_id=" << record->dispatch_info.agent_id.handle + << ", queue_id=" << record->dispatch_info.queue_id.handle + << ", kernel_id=" << record->dispatch_info.kernel_id + << ", kernel=" << client_kernels.at(record->dispatch_info.kernel_id).kernel_name + << ", start=" << record->start_timestamp << ", stop=" << record->end_timestamp + << ", private_segment_size=" << record->dispatch_info.private_segment_size + << ", group_segment_size=" << record->dispatch_info.group_segment_size + << ", workgroup_size=(" << record->dispatch_info.workgroup_size.x << "," + << record->dispatch_info.workgroup_size.y << "," + << record->dispatch_info.workgroup_size.z << "), grid_size=(" + << record->dispatch_info.grid_size.x << "," << record->dispatch_info.grid_size.y + << "," << record->dispatch_info.grid_size.z << ")"; + + if(record->start_timestamp > record->end_timestamp) + throw std::runtime_error("kernel dispatch: start > end"); + + auto _lk = get_call_stack_lock(); + call_stack_v->emplace_back( + source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + } + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) + { + auto* record = + static_cast(header->payload); + + auto info = std::stringstream{}; + + auto dt = (record->end_timestamp - record->start_timestamp); + info << std::left << "tid=" << record->thread_id << ", cid=" << std::setw(3) + << record->correlation_id.internal << ", kind=" << std::setw(2) << record->kind + << ", operation=" << std::setw(3) << record->operation << ", phase= " + << ", dt_nsec=" << std::setw(8) << dt + << ", src_agent_id=" << record->src_agent_id.handle + << ", dst_agent_id=" << record->dst_agent_id.handle + << ", direction=" << record->operation << ", start=" << record->start_timestamp + << ", stop=" << record->end_timestamp + << ", name=" << bf_name_info.at(record->kind, record->operation); + + if(record->start_timestamp > record->end_timestamp) + throw std::runtime_error("memory copy: start > end"); + + auto _lk = get_call_stack_lock(); + call_stack_v->emplace_back( + source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + } + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) + { + auto* record = + static_cast(header->payload); + + auto info = std::stringstream{}; + + auto _elapsed = + std::chrono::duration_cast>( + std::chrono::nanoseconds{record->end_timestamp - record->start_timestamp}) + .count(); + + auto dt = (record->end_timestamp - record->start_timestamp); + info << std::left << "tid=" << record->thread_id << ", cid=" << std::setw(3) + << record->correlation_id.internal << ", kind=" << std::setw(2) << record->kind + << ", operation=" << std::setw(3) << record->operation << ", phase= " + << ", dt_nsec=" << std::setw(8) << dt << ", agent_id=" << record->agent_id.handle + << ", queue_id=" << record->queue_id.handle << ", thread_id=" << record->thread_id + << ", elapsed=" << std::setprecision(3) << std::fixed << _elapsed + << " usec, flags=" << record->flags + << ", name=" << bf_name_info.at(record->kind, record->operation); + + auto _lk = get_call_stack_lock(); + call_stack_v->emplace_back( + source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + } + else + { + auto _msg = std::stringstream{}; + _msg << "unexpected rocprofiler_record_header_t category + kind: (" << header->category + << " + " << header->kind << ")"; + throw std::runtime_error{_msg.str()}; + } + } + + (void) context; + (void) buffer_id; +} + +void +tool_control_init(rocprofiler_context_id_t& primary_ctx) +{ + // Create a specialized (throw-away) context for handling ROCTx profiler pause and resume. + // A separate context is used because if the context that is associated with roctxProfilerPause + // disabled that same context, a call to roctxProfilerResume would be ignored because the + // context that enables the callback for that API call is disabled. + auto cntrl_ctx = rocprofiler_context_id_t{}; + ROCPROFILER_CALL(rocprofiler_create_context(&cntrl_ctx), "control context creation failed"); + + // enable callback marker tracing with only the pause/resume operations + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + cntrl_ctx, + ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, + nullptr, + 0, + tool_tracing_ctrl_callback, + &primary_ctx), + "callback tracing service failed to configure"); + + // start the context so that it is always active + ROCPROFILER_CALL(rocprofiler_start_context(cntrl_ctx), "start of control context"); +} + +int +tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +{ + assert(tool_data != nullptr); + + auto* call_stack_v = static_cast(tool_data); + + call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); + + for(const auto& itr : cb_name_info) + { + auto name_idx = std::stringstream{}; + name_idx << " [" << std::setw(3) << itr.value << "]"; + call_stack_v->emplace_back( + source_location{"rocprofiler_callback_tracing_kind_names " + name_idx.str(), + __FILE__, + __LINE__, + std::string{itr.name}}); + + for(auto [didx, ditr] : itr.items()) + { + auto operation_idx = std::stringstream{}; + operation_idx << " [" << std::setw(3) << didx << "]"; + call_stack_v->emplace_back(source_location{ + "rocprofiler_callback_tracing_kind_operation_names" + operation_idx.str(), + __FILE__, + __LINE__, + std::string{"- "} + std::string{*ditr}}); + } + } + + for(const auto& itr : bf_name_info) + { + auto name_idx = std::stringstream{}; + name_idx << " [" << std::setw(3) << itr.value << "]"; + call_stack_v->emplace_back( + source_location{"rocprofiler_buffer_tracing_kind_names " + name_idx.str(), + __FILE__, + __LINE__, + std::string{itr.name}}); + + for(auto [didx, ditr] : itr.items()) + { + auto operation_idx = std::stringstream{}; + operation_idx << " [" << std::setw(3) << didx << "]"; + call_stack_v->emplace_back(source_location{ + "rocprofiler_buffer_tracing_kind_operation_names" + operation_idx.str(), + __FILE__, + __LINE__, + std::string{"- "} + std::string{*ditr}}); + } + } + + client_fini_func = fini_func; + + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation failed"); + + // enable the control + tool_control_init(client_ctx); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + tool_callback_tracing_callback, + tool_data), + "callback tracing service failed to configure"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_OMPT, + nullptr, + 0, + tool_callback_tracing_callback, + tool_data), + "callback tracing service failed to configure") + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, + nullptr, + 0, + tool_callback_tracing_callback, + tool_data), + "callback tracing service failed to configure"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API, + nullptr, + 0, + tool_callback_tracing_callback, + tool_data), + "callback tracing service failed to configure"); + + constexpr auto buffer_size_bytes = 4096; + constexpr auto buffer_watermark_bytes = buffer_size_bytes - (buffer_size_bytes / 8); + + ROCPROFILER_CALL(rocprofiler_create_buffer(client_ctx, + buffer_size_bytes, + buffer_watermark_bytes, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_buffered_tracing_callback, + tool_data, + &client_buffer), + "buffer creation"); + + for(auto itr : {ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY}) + { + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + client_ctx, itr, nullptr, 0, client_buffer), + "buffer tracing service configure"); + } + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), + "failure checking context validity"); + if(valid_ctx == 0) + { + // notify rocprofiler that initialization failed + // and all the contexts, buffers, etc. created + // should be ignored + return -1; + } + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "rocprofiler context start failed"); + + // no errors + return 0; +} + +void +tool_fini(void* tool_data) +{ + assert(tool_data != nullptr); + + auto* _call_stack = static_cast(tool_data); + auto _lk = get_call_stack_lock(); + _call_stack->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); + + print_call_stack(*_call_stack); + + delete _call_stack; +} +} // namespace + +void +setup() +{} + +void +shutdown() +{ + if(client_id) client_fini_func(*client_id); +} + +void +start() +{ + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "rocprofiler context start failed"); +} + +void +stop() +{ + int status = 0; + ROCPROFILER_CALL(rocprofiler_is_initialized(&status), "failed to retrieve init status"); + if(status != 0) + { + ROCPROFILER_CALL(rocprofiler_stop_context(client_ctx), "rocprofiler context stop failed"); + } +} +} // namespace client + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // set the client name + id->name = "ExampleTool"; + + // store client info + client::client_id = id; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "." + << minor << "." << patch << " (" << runtime_version << ")"; + + std::clog << info.str() << std::endl; + + // demonstration of alternative way to get the version info + { + auto version_info = std::array{}; + ROCPROFILER_CALL( + rocprofiler_get_version(&version_info.at(0), &version_info.at(1), &version_info.at(2)), + "failed to get version info"); + + if(std::array{major, minor, patch} != version_info) + { + throw std::runtime_error{"version info mismatch"}; + } + } + + // data passed around all the callbacks + auto* client_tool_data = new std::vector{}; + + // add first entry + client_tool_data->emplace_back( + client::source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &client::tool_init, + &client::tool_fini, + static_cast(client_tool_data)}; + + // return pointer to configure data + return &cfg; +} diff --git a/projects/rocprofiler-sdk/samples/openmp_target/client.hpp b/projects/rocprofiler-sdk/samples/openmp_target/client.hpp new file mode 100644 index 0000000000..5fcb735069 --- /dev/null +++ b/projects/rocprofiler-sdk/samples/openmp_target/client.hpp @@ -0,0 +1,44 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#ifdef openmp_target_sample_client_EXPORTS +# define CLIENT_API __attribute__((visibility("default"))) +#else +# define CLIENT_API +#endif + +namespace client +{ +void +setup() CLIENT_API; + +void +shutdown() CLIENT_API; + +void +start() CLIENT_API; + +void +stop() CLIENT_API; +} // namespace client diff --git a/projects/rocprofiler-sdk/samples/openmp_target/main.cpp b/projects/rocprofiler-sdk/samples/openmp_target/main.cpp new file mode 100644 index 0000000000..be04bd0e75 --- /dev/null +++ b/projects/rocprofiler-sdk/samples/openmp_target/main.cpp @@ -0,0 +1,161 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// #include "client.hpp" + +#include + +#include +#include + +constexpr float EPS_FLOAT = 1.0e-7f; +constexpr double EPS_DOUBLE = 1.0e-15; + +#pragma omp declare target +template +T +mul(T a, T b) +{ + T c; + c = a * b; + return c; +} +#pragma omp end declare target + +template +void +vmul(T* a, T* b, T* c, int N) +{ +#pragma omp target map(to : a [0:N], b [0:N]) map(from : c [0:N]) +#pragma omp teams distribute parallel for + for(int i = 0; i < N; i++) + { + c[i] = mul(a[i], b[i]); + } +} + +int +main() +{ + // client::setup(); + auto range_id = roctxRangeStart("main"); + + constexpr int N = 100000; + int a_i[N], b_i[N], c_i[N], validate_i[N]; + float a_f[N], b_f[N], c_f[N], validate_f[N]; + double a_d[N], b_d[N], c_d[N], validate_d[N]; + int N_errors = 0; + bool flag = false; + + roctxMark("initialization"); + +#pragma omp parallel for + for(int i = 0; i < N; ++i) + { + a_f[i] = a_i[i] = i + 1; + b_f[i] = b_i[i] = i + 2; + a_d[i] = a_i[i]; + b_d[i] = b_i[i]; + validate_i[i] = a_i[i] * b_i[i]; + validate_f[i] = a_f[i] * b_f[i]; + validate_d[i] = a_d[i] * b_d[i]; + } + + vmul(a_i, b_i, c_i, N); + vmul(a_f, b_f, c_f, N); + + auto tid = roctx_thread_id_t{}; + // get the thread id recognized by rocprofiler-sdk from roctx + roctxGetThreadId(&tid); + // pause API tracing + roctxProfilerPause(tid); + + // we don't expect to see the third vmul + vmul(a_d, b_d, c_d, N); + + // resume API tracing + roctxProfilerResume(tid); + + for(int i = 0; i < N; i++) + { + if(c_i[i] != validate_i[i]) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf( + "First fail: c_i[%d](%d) != validate_i[%d](%d)\n", i, c_i[i], i, validate_i[i]); + flag = true; + } + } + } + flag = false; + for(int i = 0; i < N; i++) + { + if(fabs(c_f[i] - validate_f[i]) > EPS_FLOAT) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf("First fail: c_f[%d](%f) != validate_f[%d](%f)\n", + i, + static_cast(c_f[i]), + i, + static_cast(validate_f[i])); + flag = true; + } + } + } + flag = false; + for(int i = 0; i < N; i++) + { + if(fabs(c_d[i] - validate_d[i]) > EPS_DOUBLE) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf( + "First fail: c_d[%d](%f) != validate_d[%d](%f)\n", i, c_d[i], i, validate_d[i]); + flag = true; + } + } + } + if(N_errors == 0) + { + printf("Success\n"); + return 0; + } + else + { + printf("Total %d failures\n", N_errors); + printf("Fail\n"); + return 1; + } + + roctxRangeStop(range_id); + + // client::stop(); + // client::shutdown(); +} diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt index 1936f6dde3..45431b8198 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt @@ -27,7 +27,6 @@ set(ROCPROFILER_HEADER_FILES internal_threading.h marker.h ompt.h - openmp.h pc_sampling.h profile_config.h registration.h @@ -43,7 +42,7 @@ install( add_subdirectory(hip) add_subdirectory(hsa) add_subdirectory(marker) -add_subdirectory(openmp) +add_subdirectory(ompt) add_subdirectory(rccl) add_subdirectory(cxx) add_subdirectory(kfd) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h index 7be1c33756..85557bfc62 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h @@ -84,7 +84,7 @@ typedef struct } rocprofiler_buffer_tracing_hip_api_record_t; /** - * @brief Additional trace data for OpenMP target routines + * @brief Additional trace data for OMPT target routines */ typedef struct rocprofiler_buffer_tracing_ompt_target_t @@ -98,13 +98,13 @@ typedef struct rocprofiler_buffer_tracing_ompt_target_t typedef struct rocprofiler_buffer_tracing_ompt_target_data_op_t { - uint64_t host_op_id; // from the host_op_id argument to the OMPT callback - int32_t optype; // ompt_target_data_op_t kind of operation - int32_t src_device_num; // ompt device number for data source - int32_t dest_device_num; // ompt device number for data destination - int32_t reserved; // for padding - uint64_t bytes; // size in bytes of the operation - const void* codeptr_ra; // pointer to the callsite of the target_data_op + uint64_t host_op_id; // from the host_op_id argument to the OMPT callback + int32_t optype; // ompt_target_data_op_t kind of operation + int32_t src_device_num; // ompt device number for data source + int32_t dst_device_num; // ompt device number for data destination + int32_t reserved; // for padding + uint64_t bytes; // size in bytes of the operation + const void* codeptr_ra; // pointer to the callsite of the target_data_op } rocprofiler_buffer_tracing_ompt_target_data_op_t; typedef struct rocprofiler_buffer_tracing_ompt_target_kernel_t @@ -115,9 +115,9 @@ typedef struct rocprofiler_buffer_tracing_ompt_target_kernel_t } rocprofiler_buffer_tracing_ompt_target_kernel_t; /** - * @brief ROCProfiler Buffer OPENMP API Tracer Record. + * @brief ROCProfiler Buffer OMPT Tracer Record. */ -typedef struct rocprofiler_buffer_tracing_ompt_api_record_t +typedef struct rocprofiler_buffer_tracing_ompt_record_t { uint64_t size; ///< size of this struct rocprofiler_buffer_tracing_kind_t kind; @@ -129,16 +129,16 @@ typedef struct rocprofiler_buffer_tracing_ompt_api_record_t union { rocprofiler_buffer_tracing_ompt_target_t target; - rocprofiler_buffer_tracing_ompt_target_data_op_t target_data; - rocprofiler_buffer_tracing_ompt_target_kernel_t kernel; + rocprofiler_buffer_tracing_ompt_target_data_op_t target_data_op; + rocprofiler_buffer_tracing_ompt_target_kernel_t target_kernel; uint64_t reserved[5]; }; /// @var kind - /// @brief ::ROCPROFILER_CALLBACK_TRACING_OPENMP + /// @brief ::ROCPROFILER_BUFFER_TRACING_OMPT /// @var operation - /// @brief Specification of the API function,::rocprofiler_ompt_operation_t -} rocprofiler_buffer_tracing_ompt_api_record_t; + /// @brief Specification of the ::rocprofiler_ompt_operation_t +} rocprofiler_buffer_tracing_ompt_record_t; /** * @brief ROCProfiler Buffer Marker Tracer Record. diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h index a3f282f140..ecd74a94af 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include @@ -80,13 +80,13 @@ typedef struct } rocprofiler_callback_tracing_hip_api_data_t; /** - * @brief ROCProfiler OPENMP Callback Data + * @brief ROCProfiler OMPT Callback Data */ typedef struct { - uint64_t size; ///< size of this struct - rocprofiler_ompt_api_args_t args; -} rocprofiler_callback_tracing_ompt_api_data_t; + uint64_t size; ///< size of this struct + rocprofiler_ompt_args_t args; +} rocprofiler_callback_tracing_ompt_data_t; /** * @brief ROCProfiler Marker Tracer Callback Data. diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp index 3a620a4667..7832d62ed9 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp @@ -78,6 +78,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, hsa_api, "HSA API function") ROCPROFILER_DEFINE_CATEGORY(category, hip_api, "HIP API function") ROCPROFILER_DEFINE_CATEGORY(category, marker_api, "Marker API region") ROCPROFILER_DEFINE_CATEGORY(category, rccl_api, "RCCL API function") +ROCPROFILER_DEFINE_CATEGORY(category, openmp, "OpenMP") ROCPROFILER_DEFINE_CATEGORY(category, kernel_dispatch, "GPU kernel dispatch") ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy") ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation") @@ -87,6 +88,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation") ROCPROFILER_PERFETTO_CATEGORY(category::hip_api), \ ROCPROFILER_PERFETTO_CATEGORY(category::marker_api), \ ROCPROFILER_PERFETTO_CATEGORY(category::rccl_api), \ + ROCPROFILER_PERFETTO_CATEGORY(category::openmp), \ ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \ ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy), \ ROCPROFILER_PERFETTO_CATEGORY(category::memory_allocation) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp index ad318d546b..216e4a3efa 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp @@ -358,6 +358,14 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_rccl_api_data_t data) ROCP_SDK_SAVE_DATA_FIELD(retval); } +template +void +save(ArchiveT& ar, rocprofiler_callback_tracing_ompt_data_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + // ROCP_SDK_SAVE_DATA_FIELD(args); +} + template void save(ArchiveT& ar, rocprofiler_dispatch_counting_service_data_t data) @@ -443,6 +451,56 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_rccl_api_record_t data) save_buffer_tracing_api_record(ar, data); } +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_ompt_target_t data) +{ + ROCP_SDK_SAVE_DATA_VALUE("kind", kind); + ROCP_SDK_SAVE_DATA_VALUE("device", device_num); + ROCP_SDK_SAVE_DATA_VALUE("task_id", task_id); + ROCP_SDK_SAVE_DATA_VALUE("target_id", target_id); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_ompt_target_data_op_t data) +{ + ROCP_SDK_SAVE_DATA_VALUE("host_op_id", host_op_id); + ROCP_SDK_SAVE_DATA_VALUE("optype", optype); + ROCP_SDK_SAVE_DATA_VALUE("src_device_num", src_device_num); + ROCP_SDK_SAVE_DATA_VALUE("dst_device_num", dst_device_num); + ROCP_SDK_SAVE_DATA_VALUE("bytes", bytes); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_ompt_target_kernel_t data) +{ + ROCP_SDK_SAVE_DATA_VALUE("host_op_id", host_op_id); + ROCP_SDK_SAVE_DATA_VALUE("device_num", device_num); + ROCP_SDK_SAVE_DATA_VALUE("requested_num_teams", requested_num_teams); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_ompt_record_t data) +{ + save_buffer_tracing_api_record(ar, data); + + if(data.operation == ROCPROFILER_OMPT_ID_target_emi) + { + ROCP_SDK_SAVE_DATA_FIELD(target); + } + else if(data.operation == ROCPROFILER_OMPT_ID_target_data_op_emi) + { + ROCP_SDK_SAVE_DATA_FIELD(target_data_op); + } + else if(data.operation == ROCPROFILER_OMPT_ID_target_submit_emi) + { + ROCP_SDK_SAVE_DATA_FIELD(target_kernel); + } +} + template void save(ArchiveT& ar, rocprofiler_buffer_tracing_kernel_dispatch_record_t data) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h index 9602e25b76..14203a089e 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/external_correlation.h @@ -67,7 +67,7 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH, ///< ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_SCRATCH_MEMORY, ///< ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_RCCL_API, ///< - ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_OPENMP, ///< + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_OMPT, ///< ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION, ///< ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_LAST, } rocprofiler_external_correlation_id_request_kind_t; diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index 103dacd4b2..09fb8237ff 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -170,7 +170,7 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t ROCPROFILER_CALLBACK_TRACING_RCCL_API, ///< @RCCL tracing - ROCPROFILER_CALLBACK_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t + ROCPROFILER_CALLBACK_TRACING_OMPT, ///< @see ::rocprofiler_ompt_operation_t ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, ///< @see ///< ::rocprofiler_memory_allocation_operation_t ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION, ///< Callback notifying that a runtime @@ -200,7 +200,7 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, ///< Buffer scratch memory reclaimation info ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, ///< Correlation ID in no longer in use ROCPROFILER_BUFFER_TRACING_RCCL_API, ///< RCCL tracing - ROCPROFILER_BUFFER_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t + ROCPROFILER_BUFFER_TRACING_OMPT, ///< @see ::rocprofiler_ompt_operation_t ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, ///< @see ///< ::rocprofiler_memory_allocation_operation_t ROCPROFILER_BUFFER_TRACING_RUNTIME_INITIALIZATION, ///< Record indicating a runtime library has diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt.h index 637fe57d07..ccd3208d1e 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt.h @@ -24,13 +24,14 @@ #include #include -#include -#include +#include +#include +#include /** * @defgroup OMPT_REGISTRATION Tool registration for OpenMP Tools * - * Functions for enabling OpenMP support in tools which provide their own ompt_start_tool symbol but + * Functions for enabling OMPT support in tools which provide their own ompt_start_tool symbol but * want to defer to rocprofiler-sdk for OMPT. * * @{ @@ -38,9 +39,6 @@ ROCPROFILER_EXTERN_C_INIT -void -rocprofiler_ompt_finalize_tool() ROCPROFILER_API; - rocprofiler_status_t rocprofiler_ompt_is_initialized(int* status) ROCPROFILER_API ROCPROFILER_NONNULL(1); diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/CMakeLists.txt new file mode 100644 index 0000000000..af38708877 --- /dev/null +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/CMakeLists.txt @@ -0,0 +1,11 @@ +# +# +# Installation of public OMPT headers +# +# +set(ROCPROFILER_OMPT_HEADER_FILES api_args.h api_id.h omp-tools.h) + +install( + FILES ${ROCPROFILER_OMPT_HEADER_FILES} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/ompt + COMPONENT development) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/api_args.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/api_args.h similarity index 93% rename from projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/api_args.h rename to projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/api_args.h index b08a8c17a6..333b1e8c6c 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/api_args.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/api_args.h @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -60,7 +60,7 @@ typedef struct rocprofiler_ompt_no_args char empty; } rocprofiler_ompt_no_args; -typedef union rocprofiler_ompt_api_args_t +typedef union rocprofiler_ompt_args_t { // The ompt_data_t* values passed to the client tool are proxies. // This allows the client tool to use them as it would in their own @@ -150,11 +150,11 @@ typedef union rocprofiler_ompt_api_args_t uint64_t module_id; } device_load; - struct - { - int device_num; - uint64_t module_id; - } device_unload; + // struct + // { + // int device_num; + // uint64_t module_id; + // } device_unload; struct { @@ -299,10 +299,10 @@ typedef union rocprofiler_ompt_api_args_t ompt_data_t* target_data; ompt_data_t* host_op_id; ompt_target_data_op_t optype; - void* src_addr; + void* src_address; int src_device_num; - void* dest_addr; - int dest_device_num; + void* dst_address; + int dst_device_num; size_t bytes; const void* codeptr_ra; } target_data_op_emi; @@ -315,15 +315,15 @@ typedef union rocprofiler_ompt_api_args_t unsigned int requested_num_teams; } target_submit_emi; - struct - { - unsigned int nitems; - void** host_addr; - void** device_addr; - size_t* bytes; - unsigned int* mapping_flags; - const void* codeptr_ra; - } target_map_emi; + // struct + // { + // unsigned int nitems; + // void** host_addr; + // void** device_addr; + // size_t* bytes; + // unsigned int* mapping_flags; + // const void* codeptr_ra; + // } target_map_emi; struct { @@ -335,6 +335,6 @@ typedef union rocprofiler_ompt_api_args_t rocprofiler_ompt_callback_functions_t callback_functions; -} rocprofiler_ompt_api_args_t; +} rocprofiler_ompt_args_t; ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/api_id.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/api_id.h new file mode 100644 index 0000000000..4a15202b36 --- /dev/null +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/api_id.h @@ -0,0 +1,67 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +/** + * @brief ROCProfiler enumeration of OMPT (OpenMP tools) tracing operations + * NOTE: These are callbacks into the ROCProfiler SDK from the vendor-provided OMPT implementation + */ +typedef enum // NOLINT(performance-enum-size) +{ + ROCPROFILER_OMPT_ID_NONE = -1, + ROCPROFILER_OMPT_ID_thread_begin = 0, + ROCPROFILER_OMPT_ID_thread_end, + ROCPROFILER_OMPT_ID_parallel_begin, + ROCPROFILER_OMPT_ID_parallel_end, + ROCPROFILER_OMPT_ID_task_create, + ROCPROFILER_OMPT_ID_task_schedule, + ROCPROFILER_OMPT_ID_implicit_task, + ROCPROFILER_OMPT_ID_device_initialize, + ROCPROFILER_OMPT_ID_device_finalize, + ROCPROFILER_OMPT_ID_device_load, + // ROCPROFILER_OMPT_ID_device_unload, + ROCPROFILER_OMPT_ID_sync_region_wait, + ROCPROFILER_OMPT_ID_mutex_released, + ROCPROFILER_OMPT_ID_dependences, + ROCPROFILER_OMPT_ID_task_dependence, + ROCPROFILER_OMPT_ID_work, + ROCPROFILER_OMPT_ID_masked, + ROCPROFILER_OMPT_ID_sync_region, + ROCPROFILER_OMPT_ID_lock_init, + ROCPROFILER_OMPT_ID_lock_destroy, + ROCPROFILER_OMPT_ID_mutex_acquire, + ROCPROFILER_OMPT_ID_mutex_acquired, + ROCPROFILER_OMPT_ID_nest_lock, + ROCPROFILER_OMPT_ID_flush, + ROCPROFILER_OMPT_ID_cancel, + ROCPROFILER_OMPT_ID_reduction, + ROCPROFILER_OMPT_ID_dispatch, + ROCPROFILER_OMPT_ID_target_emi, + ROCPROFILER_OMPT_ID_target_data_op_emi, + ROCPROFILER_OMPT_ID_target_submit_emi, + // ROCPROFILER_OMPT_ID_target_map_emi, + ROCPROFILER_OMPT_ID_error, + ROCPROFILER_OMPT_ID_callback_functions, // fake to return struct of ompt callback function + // pointers + ROCPROFILER_OMPT_ID_LAST +} rocprofiler_ompt_operation_t; diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/omp-tools.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/omp-tools.h similarity index 100% rename from projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/omp-tools.h rename to projects/rocprofiler-sdk/source/include/rocprofiler-sdk/ompt/omp-tools.h diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/CMakeLists.txt deleted file mode 100644 index a3263d77e8..0000000000 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# -# -# Installation of public OpenMP headers -# -# -set(ROCPROFILER_OPENMP_HEADER_FILES api_args.h api_id.h omp-tools.h) - -install( - FILES ${ROCPROFILER_OPENMP_HEADER_FILES} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/openmp - COMPONENT development) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/api_id.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/api_id.h deleted file mode 100644 index afb0283e0b..0000000000 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp/api_id.h +++ /dev/null @@ -1,67 +0,0 @@ -// MIT License -// -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#pragma once - -/** - * @brief ROCProfiler enumeration of OPENMP (OpenMP tools) tracing operations - * NOTE: These are callbacks into the ROCProfiler SDK from the vendor-provided OMPT implementation - */ -typedef enum // NOLINT(performance-enum-size) -{ - ROCPROFILER_OPENMP_ID_NONE = -1, - ROCPROFILER_OPENMP_ID_thread_begin = 0, - ROCPROFILER_OPENMP_ID_thread_end, - ROCPROFILER_OPENMP_ID_parallel_begin, - ROCPROFILER_OPENMP_ID_parallel_end, - ROCPROFILER_OPENMP_ID_task_create, - ROCPROFILER_OPENMP_ID_task_schedule, - ROCPROFILER_OPENMP_ID_implicit_task, - ROCPROFILER_OPENMP_ID_device_initialize, - ROCPROFILER_OPENMP_ID_device_finalize, - ROCPROFILER_OPENMP_ID_device_load, - ROCPROFILER_OPENMP_ID_device_unload, - ROCPROFILER_OPENMP_ID_sync_region_wait, - ROCPROFILER_OPENMP_ID_mutex_released, - ROCPROFILER_OPENMP_ID_dependences, - ROCPROFILER_OPENMP_ID_task_dependence, - ROCPROFILER_OPENMP_ID_work, - ROCPROFILER_OPENMP_ID_masked, - ROCPROFILER_OPENMP_ID_sync_region, - ROCPROFILER_OPENMP_ID_lock_init, - ROCPROFILER_OPENMP_ID_lock_destroy, - ROCPROFILER_OPENMP_ID_mutex_acquire, - ROCPROFILER_OPENMP_ID_mutex_acquired, - ROCPROFILER_OPENMP_ID_nest_lock, - ROCPROFILER_OPENMP_ID_flush, - ROCPROFILER_OPENMP_ID_cancel, - ROCPROFILER_OPENMP_ID_reduction, - ROCPROFILER_OPENMP_ID_dispatch, - ROCPROFILER_OPENMP_ID_target_emi, - ROCPROFILER_OPENMP_ID_target_data_op_emi, - ROCPROFILER_OPENMP_ID_target_submit_emi, - ROCPROFILER_OPENMP_ID_target_map_emi, - ROCPROFILER_OPENMP_ID_error, - ROCPROFILER_OPENMP_ID_callback_functions, // fake to return struct of ompt callback function - // pointers - ROCPROFILER_OPENMP_ID_LAST -} rocprofiler_ompt_operation_t; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt index 449737e03a..af6e69e157 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt @@ -5,7 +5,7 @@ rocprofiler_activate_clang_tidy() set(ROCPROFILER_LIB_HEADERS agent.hpp buffer.hpp external_correlation.hpp intercept_table.hpp - internal_threading.hpp registration.hpp runtime_initialization.hpp) + internal_threading.hpp ompt.hpp registration.hpp runtime_initialization.hpp) set(ROCPROFILER_LIB_SOURCES agent.cpp buffer.cpp @@ -18,10 +18,11 @@ set(ROCPROFILER_LIB_SOURCES external_correlation.cpp intercept_table.cpp internal_threading.cpp + ompt.cpp pc_sampling.cpp profile_config.cpp - registration.cpp rocprofiler.cpp + registration.cpp runtime_initialization.cpp) # ----------------------------------------------------------------------------------------# @@ -52,6 +53,7 @@ add_subdirectory(kernel_dispatch) add_subdirectory(page_migration) add_subdirectory(rccl) add_subdirectory(details) +add_subdirectory(ompt) target_link_libraries( rocprofiler-sdk-object-library diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp index fe2e004f9e..1dad1a567a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp @@ -30,6 +30,7 @@ #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/kernel_dispatch/kernel_dispatch.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" +#include "lib/rocprofiler-sdk/ompt/ompt.hpp" #include "lib/rocprofiler-sdk/page_migration/page_migration.hpp" #include "lib/rocprofiler-sdk/rccl/rccl.hpp" #include "lib/rocprofiler-sdk/registration.hpp" @@ -88,7 +89,7 @@ ROCPROFILER_BUFFER_TRACING_KIND_STRING(PAGE_MIGRATION) ROCPROFILER_BUFFER_TRACING_KIND_STRING(SCRATCH_MEMORY) ROCPROFILER_BUFFER_TRACING_KIND_STRING(CORRELATION_ID_RETIREMENT) ROCPROFILER_BUFFER_TRACING_KIND_STRING(RCCL_API) -ROCPROFILER_BUFFER_TRACING_KIND_STRING(OPENMP) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(OMPT) ROCPROFILER_BUFFER_TRACING_KIND_STRING(RUNTIME_INITIALIZATION) template @@ -273,6 +274,11 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ val = rocprofiler::page_migration::name_by_id(operation); break; } + case ROCPROFILER_BUFFER_TRACING_OMPT: + { + val = rocprofiler::ompt::name_by_id(operation); + break; + } case ROCPROFILER_BUFFER_TRACING_RUNTIME_INITIALIZATION: { val = rocprofiler::runtime_init::name_by_id(operation); @@ -282,10 +288,6 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; } - case ROCPROFILER_BUFFER_TRACING_OPENMP: - { - return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; - } }; if(!val) @@ -403,6 +405,11 @@ rocprofiler_iterate_buffer_tracing_kind_operations( ops = rocprofiler::page_migration::get_ids(); break; } + case ROCPROFILER_BUFFER_TRACING_OMPT: + { + ops = rocprofiler::ompt::get_ids(); + break; + } case ROCPROFILER_BUFFER_TRACING_RUNTIME_INITIALIZATION: { ops = rocprofiler::runtime_init::get_ids(); @@ -412,10 +419,6 @@ rocprofiler_iterate_buffer_tracing_kind_operations( { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; } - case ROCPROFILER_BUFFER_TRACING_OPENMP: - { - return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; - } } for(const auto& itr : ops) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp index 9e6f3b7aa0..f3f65a99f4 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp @@ -30,6 +30,7 @@ #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/kernel_dispatch/kernel_dispatch.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" +#include "lib/rocprofiler-sdk/ompt/ompt.hpp" #include "lib/rocprofiler-sdk/rccl/rccl.hpp" #include "lib/rocprofiler-sdk/registration.hpp" #include "lib/rocprofiler-sdk/runtime_initialization.hpp" @@ -85,7 +86,7 @@ ROCPROFILER_CALLBACK_TRACING_KIND_STRING(KERNEL_DISPATCH) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_COPY) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_ALLOCATION) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(RCCL_API) -ROCPROFILER_CALLBACK_TRACING_KIND_STRING(OPENMP) +ROCPROFILER_CALLBACK_TRACING_KIND_STRING(OMPT) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(RUNTIME_INITIALIZATION) template @@ -255,9 +256,9 @@ rocprofiler_query_callback_tracing_kind_operation_name(rocprofiler_callback_trac val = rocprofiler::hsa::async_copy::name_by_id(operation); break; } - case ROCPROFILER_CALLBACK_TRACING_OPENMP: + case ROCPROFILER_CALLBACK_TRACING_OMPT: { - return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + val = rocprofiler::ompt::name_by_id(operation); break; } case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: @@ -383,9 +384,9 @@ rocprofiler_iterate_callback_tracing_kind_operations( ops = rocprofiler::hsa::async_copy::get_ids(); break; } - case ROCPROFILER_CALLBACK_TRACING_OPENMP: + case ROCPROFILER_CALLBACK_TRACING_OMPT: { - return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + ops = rocprofiler::ompt::get_ids(); break; } case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: @@ -522,13 +523,22 @@ rocprofiler_iterate_callback_tracing_kind_operation_args( user_data); return ROCPROFILER_STATUS_SUCCESS; } + case ROCPROFILER_CALLBACK_TRACING_OMPT: + { + rocprofiler::ompt::iterate_args( + record.operation, + *static_cast(record.payload), + callback, + max_deref, + user_data); + return ROCPROFILER_STATUS_SUCCESS; + } case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT: case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: case ROCPROFILER_CALLBACK_TRACING_RCCL_API: - case ROCPROFILER_CALLBACK_TRACING_OPENMP: case ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION: { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp index a3a9a57737..b9ba17d4ff 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp @@ -66,7 +66,7 @@ struct code_object_info; struct code_object_info \ { \ static constexpr auto operation_idx = ROCPROFILER_CODE_OBJECT_##OPERATION; \ - static constexpr auto name = #OPERATION; \ + static constexpr auto name = "CODE_OBJECT_" #OPERATION; \ }; SPECIALIZE_CODE_OBJECT_INFO(NONE) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.cpp index 81f7c0d8b0..0140e44c1b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.cpp @@ -152,20 +152,53 @@ pop_latest_correlation_id(correlation_id* val) return nullptr; } - if(get_latest_correlation_id_impl().empty()) + auto& stack = get_latest_correlation_id_impl(); + if(stack.empty()) { ROCP_ERROR << "empty thread-local correlation id stack"; return nullptr; } - ROCP_ERROR_IF(get_latest_correlation_id_impl().back() != val) - << "pop_latest_correlation_id is happening out of order for " << val->internal - << ". top of stack is " << get_latest_correlation_id_impl().back()->internal; + if(stack.back() != val) + { + ROCP_ERROR << "pop_latest_correlation_id is happening out of order for " << val->internal + << ". top of stack is " << stack.back()->internal; + } - get_latest_correlation_id_impl().pop_back(); + stack.pop_back(); - return (get_latest_correlation_id_impl().empty()) ? nullptr - : get_latest_correlation_id_impl().back(); + return (stack.empty()) ? nullptr : stack.back(); } + +correlation_id* +push_correlation_id(correlation_id* val) +{ + if(!val) + { + ROCP_ERROR << "passed nullptr to correlation id"; + return nullptr; + } + + val->thread_idx = common::get_tid(); + get_latest_correlation_id_impl().emplace_back(val); + + return val; +} + +void +dump_correlation_stack(const char* s) +{ + auto& stack = get_latest_correlation_id_impl(); + auto info = std::stringstream{}; + info << s << ": tid: " << common::get_tid() << " :"; + for(const auto* itr : stack) + { + info << " " << itr->internal; + ; + } + info << "\n"; + printf("%s", info.str().c_str()); +} + } // namespace context } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.hpp index fcd32e7541..f551143a5b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/correlation_id.hpp @@ -86,6 +86,14 @@ get_latest_correlation_id(); const correlation_id* pop_latest_correlation_id(correlation_id*); +// push correlation id +correlation_id* +push_correlation_id(correlation_id*); + +// dump the cid stack for debugging +void +dump_correlation_stack(const char*); + /// permits tools opportunity to modify the correlation id based on the domain, op, and /// the rocprofiler generated correlation id struct correlation_tracing_service diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt.cpp new file mode 100644 index 0000000000..d3121a3b90 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt.cpp @@ -0,0 +1,251 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/rocprofiler-sdk/ompt/ompt.hpp" +#include "lib/common/logging.hpp" +#include "lib/common/static_object.hpp" +#include "lib/rocprofiler-sdk/ompt/details/format.hpp" // NOLINT(unused-includes) +#include "lib/rocprofiler-sdk/registration.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +namespace rocprofiler +{ +namespace ompt +{ +namespace +{ +ompt_start_tool_result_t* +get_start_tool_result() +{ + static auto*& obj = common::static_object::construct(); + return obj; +} + +namespace +{ +auto init_status = std::atomic{0}; +auto fini_status = std::atomic{0}; +ompt_finalize_tool_t tool_finalize = nullptr; +ompt_set_callback_t set_callback = nullptr; +rocprofiler_ompt_callback_functions_t ompt_callback_table = {}; +ompt_get_thread_data_t real_get_thread_data = nullptr; +ompt_get_parallel_info_t real_get_parallel_info = nullptr; +ompt_get_task_info_t real_get_task_info = nullptr; +ompt_get_target_info_t real_get_target_info = nullptr; +} // namespace + +void +set_ompt_callbacks() +{ + // set all the ompt callbacks that might be used + + auto cb = [](const char* cbname, ompt_callback_t* cbf, int cbnum) { + ompt_set_result_t result = set_callback(static_cast(cbnum), *cbf); + ROCP_WARNING_IF(result != ompt_set_always) + << "rocprofiler-sdk OpenMP tools set_callback returned " << result + << fmt::format(" (set result = {})", result) << " for " << cbname << " (id=" << cbnum + << ")"; + }; + ompt::update_table(cb); + ompt::update_callback(ompt_callback_table); +} + +// proxies for some entry points that use ompt_data_t * +ompt_data_t* +proxy_get_thread_data() +{ + ompt_data_t* real = real_get_thread_data(); + auto* ret = ompt::proxy_data_ptr(real); + return ret; +} + +int +proxy_get_parallel_info(int ancestor_level, ompt_data_t** parallel_data, int* team_size) +{ + ompt_data_t* tdata; + int tteam_size; + int ret = real_get_parallel_info(ancestor_level, &tdata, &tteam_size); + if(ret != 2) return ret; + if(team_size != nullptr) *team_size = tteam_size; + if(parallel_data != nullptr) *parallel_data = ompt::proxy_data_ptr(tdata); + return ret; +} + +int +proxy_get_task_info(int ancestor_level, + int* flags, + ompt_data_t** task_data, + ompt_frame_t** task_frame, + ompt_data_t** parallel_data, + int* thread_num) +{ + int tflags, tthread_num; + ompt_data_t * ttask_data, *tparallel_data; + ompt_frame_t* ttask_frame; + int ret = real_get_task_info( + ancestor_level, &tflags, &ttask_data, &ttask_frame, &tparallel_data, &tthread_num); + if(ret != 2) return ret; + if(flags != nullptr) *flags = tflags; + if(task_data != nullptr) *task_data = ompt::proxy_data_ptr(ttask_data); + if(task_frame != nullptr) *task_frame = ttask_frame; + if(parallel_data != nullptr) *parallel_data = ompt::proxy_data_ptr(tparallel_data); + if(thread_num != nullptr) *thread_num = tthread_num; + return ret; +} + +int +proxy_get_target_info(uint64_t* device_num, ompt_id_t* target_id, ompt_id_t* host_op_id) +{ + uint64_t tdevice_num; + ompt_id_t ttarget_id, thost_op_id; + int ret = real_get_target_info(&tdevice_num, &ttarget_id, &thost_op_id); + if(ret != 1) return ret; + if(device_num != nullptr) *device_num = tdevice_num; + if(target_id != nullptr) *target_id = ttarget_id; + if(host_op_id != nullptr) *host_op_id = thost_op_id; + return ret; +} + +#define SETCB(name) \ + ompt_callback_table.ompt_##name = reinterpret_cast(lookup("ompt_" #name)); \ + LOG_IF(FATAL, ompt_callback_table.ompt_##name == nullptr) \ + << "rocprofiler-sdk OMPT cannot find " \ + "ompt_" #name + +#define PROXYCB(name) \ + SETCB(name); \ + real_##name = ompt_callback_table.ompt_##name; \ + ompt_callback_table.ompt_##name = proxy_##name + +int +initialize(ompt_function_lookup_t lookup, int /*initial_device_num*/, ompt_data_t* /*tool_data*/) +{ + init_status.store(-1); + + tool_finalize = static_cast(lookup("ompt_finalize_tool")); + LOG_IF(FATAL, tool_finalize == nullptr) + << "rocprofiler-sdk OMPT cannot find ompt_finalize_tool"; + + set_callback = reinterpret_cast(lookup("ompt_set_callback")); + LOG_IF(FATAL, set_callback == nullptr) << "rocprofiler-sdk OMPT cannot find ompt_set_callback"; + + SETCB(enumerate_states); + SETCB(enumerate_mutex_impls); + PROXYCB(get_thread_data); // <- need proxy + SETCB(get_num_places); + SETCB(get_place_proc_ids); + SETCB(get_place_num); + SETCB(get_partition_place_nums); + SETCB(get_proc_id); + SETCB(get_state); + PROXYCB(get_parallel_info); // <= need proxy + PROXYCB(get_task_info); // <= need proxy + SETCB(get_task_memory); + SETCB(get_num_devices); + SETCB(get_num_procs); + PROXYCB(get_target_info); // <= need proxy + SETCB(get_unique_id); + + set_ompt_callbacks(); + init_status.store(1); + + return 1; // bizarre abberation in the OMPT spec, not 0 +} +#undef SETCB + +void +finalize(ompt_data_t* /*tool_data*/) +{ + fini_status.store(-1); + + // do whatever for finalization + + fini_status.store(1); +} +} // namespace + +void +finalize_ompt() +{ + if(rocprofiler::ompt::tool_finalize) rocprofiler::ompt::tool_finalize(); +} +} // namespace ompt +} // namespace rocprofiler + +extern "C" { + +rocprofiler_status_t +rocprofiler_ompt_is_initialized(int* status) +{ + *status = ::rocprofiler::ompt::init_status.load(); + return ROCPROFILER_STATUS_SUCCESS; +} + +rocprofiler_status_t +rocprofiler_ompt_is_finalized(int* status) +{ + *status = ::rocprofiler::ompt::fini_status.load(); + return ROCPROFILER_STATUS_SUCCESS; +} + +ompt_start_tool_result_t* +rocprofiler_ompt_start_tool(unsigned int /*omp_version*/, const char* /*runtime_version*/) +{ + // log to clog since logging probably won't be initialized here + auto _init_status = ::rocprofiler::ompt::init_status.load(); + if(_init_status != 0) + { + std::clog << "ERROR: rocprofiler-sdk OMPT backend has already been initialized: " + << _init_status << '\n'; + return nullptr; + } + + // don't check contexts here, client tool may not be initialized + auto* _result = ::rocprofiler::ompt::get_start_tool_result(); + + if(_result) + { + _result->initialize = ::rocprofiler::ompt::initialize; + _result->finalize = ::rocprofiler::ompt::finalize; + } + + return _result; +} + +ompt_start_tool_result_t* +ompt_start_tool(unsigned int omp_version, const char* runtime_version) ROCPROFILER_PUBLIC_API; + +ompt_start_tool_result_t* +ompt_start_tool(unsigned int omp_version, const char* runtime_version) +{ + ::rocprofiler::registration::initialize(); + return rocprofiler_ompt_start_tool(omp_version, runtime_version); +} +} diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp.h b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt.hpp similarity index 91% rename from projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp.h rename to projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt.hpp index 667c99d8b6..0dd5e07b9a 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/openmp.h +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt.hpp @@ -22,5 +22,11 @@ #pragma once -#include -#include +namespace rocprofiler +{ +namespace ompt +{ +void +finalize_ompt(); +} // namespace ompt +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/CMakeLists.txt new file mode 100644 index 0000000000..98cb8e04ad --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/CMakeLists.txt @@ -0,0 +1,8 @@ +# +# +set(ROCPROFILER_LIB_OMPT_SOURCES ompt.cpp) +set(ROCPROFILER_LIB_OMPT_HEADERS defines.hpp ompt.hpp utils.hpp) + +target_sources(rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_OMPT_SOURCES}) + +add_subdirectory(details) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/defines.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/defines.hpp new file mode 100644 index 0000000000..8aaa8e52e5 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/defines.hpp @@ -0,0 +1,155 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/common/defines.hpp" + +#define OMPT_INFO_DEFINITION_0( \ + OMPT_NAME, BACKEND_OMPT_ID, OMPT_ID, OMPT_FUNC_TYPE, OMPT_ARG, OMPT_FUNC_PTR) \ + namespace rocprofiler \ + { \ + namespace ompt \ + { \ + template <> \ + struct ompt_info : ompt_domain_info \ + { \ + static constexpr auto ompt_idx = BACKEND_OMPT_ID; \ + static constexpr auto operation_idx = OMPT_ID; \ + static constexpr auto name = OMPT_NAME; \ + static constexpr auto unsupported = false; \ + \ + using domain_type = ompt_domain_info; \ + using this_type = ompt_info; \ + \ + static constexpr auto callback_domain_idx = domain_type::callback_domain_idx; \ + static constexpr auto buffered_domain_idx = domain_type::buffered_domain_idx; \ + \ + using domain_type::args_type; \ + using domain_type::retval_type; \ + using domain_type::callback_data_type; \ + \ + static constexpr auto offset() \ + { \ + return offsetof(ompt_table_lookup::type, OMPT_FUNC_PTR); \ + } \ + \ + static auto& get_table() { return ompt_table_lookup{}(); } \ + \ + template \ + static auto& get_table_func(TableT& _table) \ + { \ + if constexpr(std::is_pointer::value) \ + { \ + return CHECK_NOTNULL(_table)->OMPT_FUNC_PTR; \ + } \ + else \ + { \ + return _table.OMPT_FUNC_PTR; \ + } \ + } \ + \ + static auto& get_table_func() { return get_table_func(get_table()); } \ + \ + template \ + static auto& get_api_data_args(DataT& _data) \ + { \ + return _data.OMPT_ARG; \ + } \ + \ + static std::vector as_arg_addr(callback_data_type) { return std::vector{}; } \ + \ + static std::vector as_arg_list(callback_data_type, int32_t) \ + { \ + return {}; \ + } \ + }; \ + } \ + } + +#define OMPT_INFO_DEFINITION_V( \ + OMPT_NAME, BACKEND_OMPT_ID, OMPT_ID, OMPT_FUNC_TYPE, OMPT_ARG, OMPT_FUNC_PTR, ...) \ + namespace rocprofiler \ + { \ + namespace ompt \ + { \ + template <> \ + struct ompt_info : ompt_domain_info \ + { \ + static constexpr auto ompt_idx = BACKEND_OMPT_ID; \ + static constexpr auto operation_idx = OMPT_ID; \ + static constexpr auto name = OMPT_NAME; \ + static constexpr auto unsupported = false; \ + \ + using domain_type = ompt_domain_info; \ + using this_type = ompt_info; \ + \ + static constexpr auto callback_domain_idx = domain_type::callback_domain_idx; \ + static constexpr auto buffered_domain_idx = domain_type::buffered_domain_idx; \ + \ + using domain_type::args_type; \ + using domain_type::retval_type; \ + using domain_type::callback_data_type; \ + \ + static constexpr auto offset() \ + { \ + return offsetof(ompt_table_lookup::type, OMPT_FUNC_PTR); \ + } \ + \ + static auto& get_table() { return ompt_table_lookup{}(); } \ + \ + template \ + static auto& get_table_func(TableT& _table) \ + { \ + if constexpr(std::is_pointer::value) \ + { \ + return CHECK_NOTNULL(_table)->OMPT_FUNC_PTR; \ + } \ + else \ + { \ + return _table.OMPT_FUNC_PTR; \ + } \ + } \ + \ + static auto& get_table_func() { return get_table_func(get_table()); } \ + \ + template \ + static auto& get_api_data_args(DataT& _data) \ + { \ + return _data.OMPT_ARG; \ + } \ + \ + static std::vector as_arg_addr(callback_data_type trace_data) \ + { \ + return std::vector{ \ + GET_ADDR_MEMBER_FIELDS(get_api_data_args(trace_data.args), __VA_ARGS__)}; \ + } \ + \ + static auto as_arg_list(callback_data_type trace_data, int32_t max_deref) \ + { \ + return utils::stringize( \ + max_deref, \ + GET_NAMED_MEMBER_FIELDS(get_api_data_args(trace_data.args), __VA_ARGS__)); \ + } \ + }; \ + } \ + } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/details/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/details/CMakeLists.txt new file mode 100644 index 0000000000..e2c92a7a93 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/details/CMakeLists.txt @@ -0,0 +1,9 @@ +# +# +# +set(ROCPROFILER_LIB_OMPT_DETAILS_SOURCES) +set(ROCPROFILER_LIB_OMPT_DETAILS_HEADERS format.hpp) + +target_sources( + rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_OMPT_DETAILS_SOURCES} + ${ROCPROFILER_LIB_OMPT_DETAILS_HEADERS}) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/details/format.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/details/format.hpp new file mode 100644 index 0000000000..c0423091fa --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/details/format.hpp @@ -0,0 +1,279 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include + +#include "fmt/core.h" + +#define ROCP_SDK_OMPT_FORMATTER(TYPE, ...) \ + template <> \ + struct formatter : rocprofiler::ompt::details::base_formatter \ + { \ + template \ + auto format(const TYPE& v, Ctx& ctx) const \ + { \ + return fmt::format_to(ctx.out(), __VA_ARGS__); \ + } \ + }; + +#define ROCP_SDK_OMPT_FORMAT_CASE_STMT(PREFIX, SUFFIX) \ + case PREFIX##_##SUFFIX: return fmt::format_to(ctx.out(), #SUFFIX) + +namespace rocprofiler +{ +namespace ompt +{ +namespace details +{ +struct base_formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } +}; +} // namespace details +} // namespace ompt +} // namespace rocprofiler + +namespace fmt +{ +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(ompt_set_result_t v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_set, error); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_set, never); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_set, impossible); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_set, sometimes); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_set, sometimes_paired); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_set, always); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(ompt_thread_t v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_thread, initial); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_thread, worker); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_thread, other); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt_thread, unknown); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(ompt_scope_endpoint_t v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, scope_begin); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, scope_end); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, scope_beginend); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(ompt_dispatch_t v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, dispatch_iteration); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, dispatch_section); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, dispatch_ws_loop_chunk); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, dispatch_taskloop_chunk); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, dispatch_distribute_chunk); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(ompt_sync_region_t v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_barrier); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_barrier_implicit); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_barrier_explicit); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_barrier_implementation); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_taskwait); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_taskgroup); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_reduction); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_barrier_implicit_workshare); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_barrier_implicit_parallel); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, sync_region_barrier_teams); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(ompt_target_data_op_t v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_alloc); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_transfer_to_device); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_transfer_from_device); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_delete); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_associate); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_disassociate); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_alloc_async); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_transfer_to_device_async); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_transfer_from_device_async); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, target_data_delete_async); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(const ompt_data_t& v, Ctx& ctx) const + { + return fmt::format_to(ctx.out(), "{}", v.value); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(const ompt_work_t& v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_loop); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_sections); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_single_executor); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_single_other); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_workshare); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_distribute); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_taskloop); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_scope); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_loop_static); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_loop_dynamic); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_loop_guided); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, work_loop_other); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(const ompt_task_status_t& v, Ctx& ctx) const + { + switch(v) + { + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, task_complete); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, task_yield); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, task_cancel); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, task_detach); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, task_early_fulfill); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, task_late_fulfill); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, task_switch); + ROCP_SDK_OMPT_FORMAT_CASE_STMT(ompt, taskwait_complete); + } + return fmt::format_to(ctx.out(), "Unknown"); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(const ompt_frame_t& v, Ctx& ctx) const + { + return fmt::format_to( + ctx.out(), + "{}exit_frame={}, enter_frame={}, exit_frame_flags={}, enter_frame_flags={}{}", + '{', + v.exit_frame, + v.enter_frame, + v.exit_frame_flags, + v.enter_frame_flags, + '}'); + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(const ompt_dependence_t& v, Ctx& ctx) const + { + // stub + return fmt::format_to(ctx.out(), "(dependence)"); + (void) v; + } +}; + +template <> +struct formatter : rocprofiler::ompt::details::base_formatter +{ + template + auto format(const ompt_dispatch_chunk_t& v, Ctx& ctx) const + { + return fmt::format_to( + ctx.out(), "{}start={}, iterations={}{}", '{', v.start, v.iterations, '}'); + } +}; +} // namespace fmt + +#undef ROCP_SDK_OMPT_FORMATTER +#undef ROCP_SDK_OMPT_OSTREAM_FORMATTER +#undef ROCP_SDK_OMPT_FORMAT_CASE_STMT diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.cpp new file mode 100644 index 0000000000..7d23f1c6b3 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.cpp @@ -0,0 +1,1162 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "lib/rocprofiler-sdk/ompt/ompt.hpp" +#include "lib/common/logging.hpp" +#include "lib/common/string_entry.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/context/correlation_id.hpp" +#include "lib/rocprofiler-sdk/tracing/fwd.hpp" +#include "lib/rocprofiler-sdk/tracing/tracing.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace ompt +{ +namespace +{ +ompt_table& +get_table(); + +struct ompt_table_lookup +{ + using type = ompt_table; + auto& operator()(type& _v) const { return _v; } + auto& operator()(type* _v) const { return *_v; } + auto& operator()() const { return (*this)(get_table()); } +}; +} // namespace +} // namespace ompt +} // namespace rocprofiler + +#define ROCPROFILER_LIB_ROCPROFILER_OMPT_OMPT_CPP_IMPL 1 +#include "ompt.def.cpp" +#undef ROCPROFILER_LIB_ROCPROFILER_OMPT_OMPT_CPP_IMPL + +namespace rocprofiler +{ +namespace ompt +{ +namespace +{ +auto& +get_ompt_state_stack() +{ + // for callbacks that don't have a place to stash context, we assume + // a per-thread stack. otherwise we stash the saved state in the ompt_data_t field. + static thread_local auto _v = tracing::small_vector_t{}; + return _v; +} + +auto* +get_ompt_data_proxy() +{ + static auto*& _v = common::static_object::construct(); + return _v; +} + +// Macros for access to appropriate ompt_data_t* proxy +#define CLIENT(name) (CHECK_NOTNULL(get_ompt_data_proxy())->get_client_ptr(name)) +#define INTERNAL(name) (CHECK_NOTNULL(get_ompt_data_proxy())->get_internal_ptr(name)) + +void +ompt_thread_begin_callback(ompt_thread_t thread_type, ompt_data_t* thread_data) +{ + ompt_impl::event(thread_type, CLIENT(thread_data)); +} + +void +ompt_thread_end_callback(ompt_data_t* thread_data) +{ + ompt_impl::event(CLIENT(thread_data)); +} + +void +ompt_parallel_begin_callback(ompt_data_t* encountering_task_data, + const ompt_frame_t* encountering_task_frame, + ompt_data_t* parallel_data, + unsigned int requested_parallelism, + int flags, + const void* codeptr_ra) +{ + ompt_impl::event(CLIENT(encountering_task_data), + encountering_task_frame, + CLIENT(parallel_data), + requested_parallelism, + flags, + codeptr_ra); +} + +void +ompt_parallel_end_callback(ompt_data_t* parallel_data, + ompt_data_t* encountering_task_data, + int flags, + const void* codeptr_ra) +{ + ompt_impl::event( + CLIENT(parallel_data), CLIENT(encountering_task_data), flags, codeptr_ra); +} + +void +ompt_task_create_callback(ompt_data_t* encountering_task_data, + const ompt_frame_t* encountering_task_frame, + ompt_data_t* new_task_data, + int flags, + int has_dependences, + const void* codeptr_ra) +{ + auto* corr_id = + ompt_impl::event_common(CLIENT(encountering_task_data), + encountering_task_frame, + CLIENT(new_task_data), + flags, + has_dependences, + codeptr_ra); + + auto* state = new ompt_task_save_state{corr_id, flags}; + INTERNAL(new_task_data)->ptr = state; + + context::pop_latest_correlation_id(corr_id); +} + +void +ompt_task_schedule_callback(ompt_data_t* prior_task_data, + ompt_task_status_t prior_task_status, + ompt_data_t* next_task_data) +{ + auto* corr_id = ompt_impl::event_common( + CLIENT(prior_task_data), prior_task_status, CLIENT(next_task_data)); + context::pop_latest_correlation_id(corr_id); + corr_id->sub_ref_count(); + + auto* pprior = INTERNAL(prior_task_data); + auto* pnext = INTERNAL(next_task_data); + assert(pprior != nullptr); + auto* state_prior = reinterpret_cast(pprior->ptr); + auto* state_next = pnext ? reinterpret_cast(pnext->ptr) : nullptr; + auto* prior_corrid = context::get_latest_correlation_id(); + if(state_prior->corr_id == prior_corrid && state_prior->task_flags != 0) + { + // pop the current correlation ID (for the prior_task) + assert((state_prior->task_flags & 0xFF) == ompt_task_explicit); + context::pop_latest_correlation_id(prior_corrid); + } + if(state_next && (state_next->task_flags & 0xFF) == ompt_task_explicit) + { + // push the next correlation ID (for the next_task) + context::push_correlation_id(state_next->corr_id); + } + if(prior_task_status == ompt_task_yield || prior_task_status == ompt_task_detach || + prior_task_status == ompt_task_switch) + return; + // the prior task is done + assert(state_prior->task_flags != 0); + if(prior_task_status == ompt_task_complete) + { + // FIXME? do we need to decrement the ref count + // state_prior->corr_id->sub_ref_count(); + delete state_prior; + pprior->ptr = nullptr; + } +} + +void +ompt_implicit_task_callback(ompt_scope_endpoint_t endpoint, + ompt_data_t* parallel_data, + ompt_data_t* task_data, + unsigned int actual_parallelism, + unsigned int index, + int flags) +{ + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin(INTERNAL(task_data), + endpoint, + CLIENT(parallel_data), + CLIENT(task_data), + actual_parallelism, + index, + flags); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end(INTERNAL(task_data), + endpoint, + CLIENT(parallel_data), + CLIENT(task_data), + actual_parallelism, + index, + flags); + } + else + { + ROCP_FATAL << "endpoint in implicit_task is not begin or end: " << endpoint; + } +} + +void +ompt_device_initialize_callback(int device_num, + const char* type, + ompt_device_t* device, + ompt_function_lookup_t lookup, + const char* documentation) +{ + ompt_impl::event( + device_num, type, device, lookup, documentation); +} + +void +ompt_device_finalize_callback(int device_num) +{ + ompt_impl::event(device_num); +} + +void +ompt_device_load_callback(int device_num, + const char* filename, + int64_t offset_in_file, + void* vma_in_file, + size_t bytes, + void* host_addr, + void* device_addr, + uint64_t module_id) +{ + ompt_impl::event(device_num, + filename, + offset_in_file, + vma_in_file, + bytes, + host_addr, + device_addr, + module_id); +} + +// void +// ompt_device_unload_callback(int device_num, uint64_t module_id) +// { +// ompt_impl::event(device_num, module_id); +// } + +void +ompt_sync_region_wait_callback(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t* parallel_data, + ompt_data_t* task_data, + const void* codeptr_ra) +{ + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin( + nullptr, kind, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end( + nullptr, kind, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else + { + ROCP_FATAL << "endpoint in sync_region_wait is not begin or end: " << endpoint; + } +} + +void +ompt_mutex_released_callback(ompt_mutex_t kind, ompt_wait_id_t wait_id, const void* codeptr_ra) +{ + ompt_impl::event(kind, wait_id, codeptr_ra); +} + +void +ompt_dependences_callback(ompt_data_t* task_data, const ompt_dependence_t* deps, int ndeps) +{ + ompt_impl::event(CLIENT(task_data), deps, ndeps); +} + +void +ompt_task_dependence_callback(ompt_data_t* src_task_data, ompt_data_t* sink_task_data) +{ + ompt_impl::event(CLIENT(src_task_data), + CLIENT(sink_task_data)); +} + +void +ompt_work_callback(ompt_work_t work_type, + ompt_scope_endpoint_t endpoint, + ompt_data_t* parallel_data, + ompt_data_t* task_data, + uint64_t count, + const void* codeptr_ra) +{ + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin(nullptr, + work_type, + endpoint, + CLIENT(parallel_data), + CLIENT(task_data), + count, + codeptr_ra); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end(nullptr, + work_type, + endpoint, + CLIENT(parallel_data), + CLIENT(task_data), + count, + codeptr_ra); + } + else + { + ROCP_FATAL << "endpoint in work is not begin or end: " << endpoint; + } +} + +void +ompt_masked_callback(ompt_scope_endpoint_t endpoint, + ompt_data_t* parallel_data, + ompt_data_t* task_data, + const void* codeptr_ra) +{ + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin( + nullptr, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end( + nullptr, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else + { + ROCP_FATAL << "endpoint in masked is not begin or end: " << endpoint; + } +} + +void +ompt_target_map_callback(ompt_id_t target_id, + unsigned int nitems, + void** host_addr, + void** device_addr, + size_t* bytes, + unsigned int* mapping_flags, + const void* codeptr_ra) +{ + common::consume_args( + target_id, nitems, host_addr, device_addr, bytes, mapping_flags, codeptr_ra); +} + +void +ompt_sync_region_callback(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t* parallel_data, + ompt_data_t* task_data, + const void* codeptr_ra) +{ + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin( + nullptr, kind, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end( + nullptr, kind, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else + { + ROCP_FATAL << "endpoint in sync_region is not begin or end: " << endpoint; + } +} + +void +ompt_lock_init_callback(ompt_mutex_t kind, + unsigned int hint, + unsigned int impl, + ompt_wait_id_t wait_id, + const void* codeptr_ra) +{ + ompt_impl::event(kind, hint, impl, wait_id, codeptr_ra); +} + +void +ompt_lock_destroy_callback(ompt_mutex_t kind, ompt_wait_id_t wait_id, const void* codeptr_ra) +{ + ompt_impl::event(kind, wait_id, codeptr_ra); +} + +void +ompt_mutex_acquire_callback(ompt_mutex_t kind, + unsigned int hint, + unsigned int impl, + ompt_wait_id_t wait_id, + const void* codeptr_ra) +{ + ompt_impl::event(kind, hint, impl, wait_id, codeptr_ra); +} + +void +ompt_mutex_acquired_callback(ompt_mutex_t kind, ompt_wait_id_t wait_id, const void* codeptr_ra) +{ + ompt_impl::event(kind, wait_id, codeptr_ra); +} + +void +ompt_nest_lock_callback(ompt_scope_endpoint_t endpoint, + ompt_wait_id_t wait_id, + const void* codeptr_ra) +{ + ompt_impl::event(endpoint, wait_id, codeptr_ra); +} + +void +ompt_flush_callback(ompt_data_t* thread_data, const void* codeptr_ra) +{ + ompt_impl::event(CLIENT(thread_data), codeptr_ra); +} + +void +ompt_cancel_callback(ompt_data_t* task_data, int flags, const void* codeptr_ra) +{ + ompt_impl::event(CLIENT(task_data), flags, codeptr_ra); +} + +void +ompt_reduction_callback(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t* parallel_data, + ompt_data_t* task_data, + const void* codeptr_ra) +{ + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin( + nullptr, kind, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end( + nullptr, kind, endpoint, CLIENT(parallel_data), CLIENT(task_data), codeptr_ra); + } + else + { + ROCP_FATAL << "endpoint in reduction is not begin or end: " << endpoint; + } +} + +void +ompt_dispatch_callback(ompt_data_t* parallel_data, + ompt_data_t* task_data, + ompt_dispatch_t kind, + ompt_data_t instance) +{ + ompt_impl::event( + CLIENT(parallel_data), CLIENT(task_data), kind, instance); +} + +void +ompt_target_emi_callback(ompt_target_t kind, + ompt_scope_endpoint_t endpoint, + int device_num, + ompt_data_t* task_data, + ompt_data_t* target_task_data, + ompt_data_t* target_data, + const void* codeptr_ra) +{ + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin(INTERNAL(target_data), + kind, + endpoint, + device_num, + CLIENT(task_data), + CLIENT(target_task_data), + CLIENT(target_data), + codeptr_ra); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end(INTERNAL(target_data), + kind, + endpoint, + device_num, + CLIENT(task_data), + CLIENT(target_task_data), + CLIENT(target_data), + codeptr_ra); + } + else + { + ROCP_FATAL << "endpoint in target_emi is not begin or end: " << endpoint; + } +} + +void +ompt_target_data_op_emi_callback(ompt_scope_endpoint_t endpoint, + ompt_data_t* target_task_data, + ompt_data_t* target_data, + ompt_id_t* host_op_id, + ompt_target_data_op_t optype, + void* src_address, + int src_device_num, + void* dst_address, + int dst_device_num, + size_t bytes, + const void* codeptr_ra) +{ + auto* _host_op_data = reinterpret_cast(host_op_id); + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin(INTERNAL(_host_op_data), + endpoint, + CLIENT(target_task_data), + CLIENT(target_data), + CLIENT(_host_op_data), + optype, + src_address, + src_device_num, + dst_address, + dst_device_num, + bytes, + codeptr_ra); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end(INTERNAL(_host_op_data), + endpoint, + CLIENT(target_task_data), + CLIENT(target_data), + CLIENT(_host_op_data), + optype, + src_address, + src_device_num, + dst_address, + dst_device_num, + bytes, + codeptr_ra); + } + else + { + ROCP_FATAL << "endpoint in target_data_op_emi is not begin or end: " << endpoint; + } +} + +void +ompt_target_submit_emi_callback(ompt_scope_endpoint_t endpoint, + ompt_data_t* target_data, + ompt_id_t* host_op_id, + unsigned int requested_num_teams) +{ + auto* _host_op_data = reinterpret_cast(host_op_id); + if(endpoint == ompt_scope_begin) + { + ompt_impl::begin(INTERNAL(_host_op_data), + endpoint, + CLIENT(target_data), + CLIENT(_host_op_data), + requested_num_teams); + } + else if(endpoint == ompt_scope_end) + { + ompt_impl::end(INTERNAL(_host_op_data), + endpoint, + CLIENT(target_data), + CLIENT(_host_op_data), + requested_num_teams); + } + else + { + ROCP_FATAL << "endpoint in target_submit_emi is not begin or end: " << endpoint; + } + (void) target_data; +} + +// void +// ompt_target_map_emi_callback(ompt_data_t* target_data, +// unsigned int nitems, +// void** host_addr, +// void** device_addr, +// size_t* bytes, +// unsigned int* mapping_flags, +// const void* codeptr_ra) +// { +// common::consume_args( +// target_data, nitems, host_addr, device_addr, bytes, mapping_flags, codeptr_ra); +// } + +void +ompt_error_callback(ompt_severity_t severity, + const char* message, + size_t length, + const void* codeptr_ra) +{ + ompt_impl::event(severity, message, length, codeptr_ra); +} +#undef CLIENT +#undef INTERNAL + +// The ompt callback table +ompt_table ompt_callback_table = { + ompt_thread_begin_callback, + ompt_thread_end_callback, + ompt_parallel_begin_callback, + ompt_parallel_end_callback, + ompt_task_create_callback, + ompt_task_schedule_callback, + ompt_implicit_task_callback, + ompt_device_initialize_callback, + ompt_device_finalize_callback, + ompt_device_load_callback, + // ompt_device_unload_callback, + ompt_sync_region_wait_callback, + ompt_mutex_released_callback, + ompt_dependences_callback, + ompt_task_dependence_callback, + ompt_work_callback, + ompt_masked_callback, + ompt_target_map_callback, + ompt_sync_region_callback, + ompt_lock_init_callback, + ompt_lock_destroy_callback, + ompt_mutex_acquire_callback, + ompt_mutex_acquired_callback, + ompt_nest_lock_callback, + ompt_flush_callback, + ompt_cancel_callback, + ompt_reduction_callback, + ompt_dispatch_callback, + ompt_target_emi_callback, + ompt_target_data_op_emi_callback, + ompt_target_submit_emi_callback, + // ompt_target_map_emi_callback, + ompt_error_callback, +}; + +ompt_table& +get_table() +{ + return ompt_callback_table; +} + +void +rocprof_ompt_cb_interface(rocprofiler_ompt_callback_functions_t& cb_functions) +{ + ompt_impl::event(cb_functions); +} +} // namespace + +ompt_data_t* +proxy_data_ptr(ompt_data_t* realptr) +{ + return (get_ompt_data_proxy())->get_client_ptr(realptr); +} + +// special case fake callback to send the ompt cb function pointers +template <> +struct ompt_info +{ + static constexpr auto callback_domain_idx = ompt_domain_info::callback_domain_idx; + static constexpr auto buffered_domain_idx = ompt_domain_info::buffered_domain_idx; + static constexpr auto operation_idx = ROCPROFILER_OMPT_ID_callback_functions; + static constexpr auto name = "omp_callback_functions"; + static constexpr bool unsupported = false; + static constexpr auto begin = -1; + + using this_type = ompt_info; + using base_type = ompt_impl; + + static constexpr auto offset() { return -1; } + + template + static auto& get_api_data_args(DataT& _data) + { + return _data.callback_functions; + } +}; + +// These implement the callbacks for OMPT +template +template +void +ompt_impl::begin(ompt_data_t* data, Args... args) +{ + using info_type = ompt_info; + + ROCP_TRACE << __FUNCTION__ << " :: " << info_type::name; + + constexpr auto external_corr_id_domain_idx = + ompt_domain_info::external_correlation_id_domain_idx; + + constexpr auto ref_count = 2; + auto thr_id = common::get_tid(); + auto callback_contexts = tracing::callback_context_data_vec_t{}; + auto buffered_contexts = tracing::buffered_context_data_vec_t{}; + auto external_corr_ids = tracing::external_correlation_id_map_t{}; + + tracing::populate_contexts(info_type::callback_domain_idx, + info_type::buffered_domain_idx, + info_type::operation_idx, + callback_contexts, + buffered_contexts, + external_corr_ids); + + auto* corr_id = tracing::correlation_service::construct(ref_count); + auto internal_corr_id = corr_id->internal; + + tracing::populate_external_correlation_ids(external_corr_ids, + thr_id, + external_corr_id_domain_idx, + info_type::operation_idx, + internal_corr_id); + + // invoke the callbacks + if(!callback_contexts.empty()) + { + auto tracer_data = common::init_public_api_struct(callback_ompt_data_t{}); + set_data_args(info_type::get_api_data_args(tracer_data.args), std::forward(args)...); + + tracing::execute_phase_enter_callbacks(callback_contexts, + thr_id, + internal_corr_id, + external_corr_ids, + info_type::callback_domain_idx, + info_type::operation_idx, + tracer_data); + } + + // enter callback may update the external correlation id field + tracing::update_external_correlation_ids( + external_corr_ids, thr_id, external_corr_id_domain_idx); + + // stash the state + ompt_save_state* state = new ompt_save_state{.thr_id = thr_id, + .start_timestamp = 0, + .operation_idx = info_type::operation_idx, + .corr_id = corr_id, + .external_corr_ids = external_corr_ids, + .callback_contexts = callback_contexts, + .buffered_contexts = buffered_contexts}; + + if(data) + data->ptr = state; + else + get_ompt_state_stack().emplace_back(state); + + // decrement the reference count before returning + corr_id->sub_ref_count(); + state->start_timestamp = common::timestamp_ns(); +} + +template +template +void +ompt_impl::end(ompt_data_t* data, Args... args) +{ + using info_type = ompt_info; + + ROCP_TRACE << __FUNCTION__ << " :: " << info_type::name; + + // END PART OF OMPT CALLBACK + auto end_timestamp = common::timestamp_ns(); + + ompt_save_state* state = nullptr; + if(data != nullptr) + state = static_cast(data->ptr); + else + state = get_ompt_state_stack().pop_back_val(); + assert(state != nullptr); + + ROCP_FATAL_IF(state->operation_idx != info_type::operation_idx) + << "Mismatch of OMPT operation: begin=" << state->operation_idx + << ", end=" << info_type::operation_idx; + + auto& callback_contexts = state->callback_contexts; + auto& buffered_contexts = state->buffered_contexts; + auto external_corr_ids = state->external_corr_ids; + + auto* corr_id = state->corr_id; + auto internal_corr_id = corr_id->internal; + + ROCP_FATAL_IF(common::get_tid() != state->thr_id) + << "MIsmatch of OMPT begin/end thread id: " + << " current=" << common::get_tid() << ", expected= " << state->thr_id; + + if(!callback_contexts.empty()) + { + auto tracer_data = common::init_public_api_struct(callback_ompt_data_t{}); + set_data_args(info_type::get_api_data_args(tracer_data.args), std::forward(args)...); + + tracing::execute_phase_exit_callbacks(callback_contexts, + external_corr_ids, + info_type::callback_domain_idx, + info_type::operation_idx, + tracer_data); + } + + if(!buffered_contexts.empty()) + { + auto buffer_record = common::init_public_api_struct(buffer_ompt_record_t{}); + if constexpr(OpIdx == ROCPROFILER_OMPT_ID_target_emi || + OpIdx == ROCPROFILER_OMPT_ID_target_data_op_emi || + OpIdx == ROCPROFILER_OMPT_ID_target_submit_emi) + { + auto tracer_data = common::init_public_api_struct(callback_ompt_data_t{}); + set_data_args(info_type::get_api_data_args(tracer_data.args), + std::forward(args)...); + if constexpr(OpIdx == ROCPROFILER_OMPT_ID_target_emi) + { + buffer_record.target.kind = tracer_data.args.target_emi.kind; + buffer_record.target.device_num = tracer_data.args.target_emi.device_num; + buffer_record.target.task_id = tracer_data.args.target_emi.task_data->value; + buffer_record.target.target_id = tracer_data.args.target_emi.target_data->value; + buffer_record.target.codeptr_ra = tracer_data.args.target_emi.codeptr_ra; + } + else if constexpr(OpIdx == ROCPROFILER_OMPT_ID_target_data_op_emi) + { + buffer_record.target_data_op.host_op_id = + tracer_data.args.target_data_op_emi.host_op_id->value; + buffer_record.target_data_op.optype = tracer_data.args.target_data_op_emi.optype; + buffer_record.target_data_op.src_device_num = + tracer_data.args.target_data_op_emi.src_device_num; + buffer_record.target_data_op.dst_device_num = + tracer_data.args.target_data_op_emi.dst_device_num; + buffer_record.target_data_op.reserved = 0; + buffer_record.target_data_op.bytes = tracer_data.args.target_data_op_emi.bytes; + buffer_record.target_data_op.codeptr_ra = + tracer_data.args.target_data_op_emi.codeptr_ra; + } + else if constexpr(OpIdx == ROCPROFILER_OMPT_ID_target_submit_emi) + { + buffer_record.target_kernel.device_num = 0; // FIXME + buffer_record.target_kernel.requested_num_teams = + tracer_data.args.target_submit_emi.requested_num_teams; + buffer_record.target_kernel.host_op_id = + tracer_data.args.target_submit_emi.host_op_id->value; + } + } + + buffer_record.start_timestamp = state->start_timestamp; + buffer_record.end_timestamp = end_timestamp; + tracing::execute_buffer_record_emplace(buffered_contexts, + state->thr_id, + internal_corr_id, + external_corr_ids, + info_type::buffered_domain_idx, + info_type::operation_idx, + buffer_record); + } + + // decrement the reference count after usage in the callback/buffers + state->corr_id->sub_ref_count(); + context::pop_latest_correlation_id(state->corr_id); + delete state; + if(data) data->ptr = nullptr; +} + +template +template +context::correlation_id* +ompt_impl::event_common(Args... args) +{ + using info_type = ompt_info; + + ROCP_TRACE << __FUNCTION__ << " :: " << info_type::name; + + constexpr auto external_corr_id_domain_idx = + ompt_domain_info::external_correlation_id_domain_idx; + + constexpr auto ref_count = 1; + auto thr_id = common::get_tid(); + auto callback_contexts = tracing::callback_context_data_vec_t{}; + auto buffered_contexts = tracing::buffered_context_data_vec_t{}; + auto external_corr_ids = tracing::external_correlation_id_map_t{}; + + tracing::populate_contexts(info_type::callback_domain_idx, + info_type::buffered_domain_idx, + info_type::operation_idx, + callback_contexts, + buffered_contexts, + external_corr_ids); + + auto buffer_record = common::init_public_api_struct(buffer_ompt_record_t{}); + auto tracer_data = common::init_public_api_struct(callback_ompt_data_t{}); + auto* corr_id = tracing::correlation_service::construct(ref_count); + uint64_t internal_corr_id = corr_id->internal; + + tracing::populate_external_correlation_ids(external_corr_ids, + thr_id, + external_corr_id_domain_idx, + info_type::operation_idx, + internal_corr_id); + + // invoke the callbacks + if(!callback_contexts.empty()) + { + set_data_args(info_type::get_api_data_args(tracer_data.args), std::forward(args)...); + + tracing::execute_phase_none_callbacks(callback_contexts, + thr_id, + internal_corr_id, + external_corr_ids, + info_type::callback_domain_idx, + info_type::operation_idx, + tracer_data); + } + + tracing::update_external_correlation_ids( + external_corr_ids, thr_id, external_corr_id_domain_idx); + + if(!buffered_contexts.empty()) + { + buffer_record.start_timestamp = common::timestamp_ns(); + buffer_record.end_timestamp = buffer_record.start_timestamp; + tracing::execute_buffer_record_emplace(buffered_contexts, + thr_id, + internal_corr_id, + external_corr_ids, + info_type::buffered_domain_idx, + info_type::operation_idx, + buffer_record); + } + + return corr_id; +} + +template +template +void +ompt_impl::event(Args&&... args) +{ + auto corr_id = ompt_impl::event_common(std::forward(args)...); + context::pop_latest_correlation_id(corr_id); + corr_id->sub_ref_count(); +} + +namespace +{ +template +decltype(auto) +convert_arg(Tp&& _arg) +{ + using type = common::mpl::unqualified_type_t; + if constexpr(common::mpl::is_string_type::value) + { + if(!_arg) return std::remove_reference_t(_arg); + return common::get_string_entry(std::string_view{_arg})->c_str(); + } + else + return std::forward(_arg); +} + +template +void +get_ids(std::vector& _id_list, std::index_sequence) +{ + auto _idx = ompt_info::operation_idx; + if(_idx < ompt_domain_info::last) _id_list.emplace_back(_idx); + + if constexpr(sizeof...(OpIdxTail) > 0) get_ids(_id_list, std::index_sequence{}); +} + +template +const char* +name_by_id(const uint32_t id, std::index_sequence) +{ + if(OpIdx == id) return ompt_info::name; + + if constexpr(sizeof...(OpIdxTail) > 0) + return name_by_id(id, std::index_sequence{}); + else + return nullptr; +} + +bool +should_enable_callback(rocprofiler_callback_tracing_kind_t _callback_domain, + rocprofiler_buffer_tracing_kind_t _buffered_domain, + int _operation) +{ + // we loop over all the *registered* contexts and see if any of them, at any point in time, + // might require callback or buffered API tracing + for(const auto& itr : context::get_registered_contexts()) + { + if(!itr) continue; + + // if there is a callback tracer enabled for the given domain and op, we need to wrap + if(itr->callback_tracer && itr->callback_tracer->domains(_callback_domain) && + itr->callback_tracer->domains(_callback_domain, _operation)) + return true; + + // if there is a buffered tracer enabled for the given domain and op, we need to wrap + if(itr->buffered_tracer && itr->buffered_tracer->domains(_buffered_domain) && + itr->buffered_tracer->domains(_buffered_domain, _operation)) + return true; + } + return false; +} + +template +void +update_table(ompt_update_func f, std::integral_constant) +{ + auto _info = ompt_info(); + + if(_info.unsupported) + { + ROCP_INFO << "OMPT operation not supported: " << _info.name; + return; + } + // check to see if there are any contexts which enable this operation in the OMPT API domain + if(!should_enable_callback( + _info.callback_domain_idx, _info.buffered_domain_idx, _info.operation_idx)) + return; + + ROCP_TRACE << "updating table entry for " << _info.name; + + // Register this callback for OMPT at init time. + auto& _func = _info.get_table_func(); + auto* _ompt_cb = reinterpret_cast(&_func); + f(_info.name, _ompt_cb, _info.ompt_idx); +} + +template +void +update_table(ompt_update_func f, std::index_sequence) +{ + update_table(f, std::integral_constant{}); + if constexpr(sizeof...(OpIdxTail) > 0) update_table(f, std::index_sequence{}); +} +} // namespace + +template +template +void +ompt_impl::set_data_args(DataArgsT& _data_args, Args... args) +{ + if constexpr(sizeof...(Args) == 0) + _data_args.no_args.empty = '\0'; + else + _data_args = DataArgsT{convert_arg(args)...}; +} + +// check out the assembly here... this compiles to a switch statement +const char* +name_by_id(uint32_t id) +{ + return name_by_id(id, std::make_index_sequence{}); +} + +std::vector +get_ids() +{ + constexpr auto last_api_id = ompt_domain_info::last; + auto _data = std::vector{}; + _data.reserve(last_api_id); + get_ids(_data, std::make_index_sequence{}); + return _data; +} + +template +void +iterate_args(const uint32_t id, + const DataT& data, + rocprofiler_callback_tracing_operation_args_cb_t func, + int32_t max_deref, + void* user_data, + std::index_sequence) +{ + if(OpIdx == id) + { + using info_type = ompt_info; + auto&& arg_list = info_type::as_arg_list(data, max_deref); + auto&& arg_addr = info_type::as_arg_addr(data); + for(size_t i = 0; i < std::min(arg_list.size(), arg_addr.size()); ++i) + { + auto ret = func(info_type::callback_domain_idx, // kind + id, // operation + i, // arg_number + arg_addr.at(i), // arg_value_addr + arg_list.at(i).indirection_level, // indirection + arg_list.at(i).type, // arg_type + arg_list.at(i).name, // arg_name + arg_list.at(i).value.c_str(), // arg_value_str + arg_list.at(i).dereference_count, // num deref in str + user_data); + if(ret != 0) break; + } + return; + } + if constexpr(sizeof...(OpIdxTail) > 0) + iterate_args(id, data, func, max_deref, user_data, std::index_sequence{}); +} + +void +update_callback(rocprofiler_ompt_callback_functions_t& cb_functions) +{ + auto _info = ompt_info(); + + if(should_enable_callback( + _info.callback_domain_idx, _info.buffered_domain_idx, _info.operation_idx)) + rocprof_ompt_cb_interface(cb_functions); +} + +void +iterate_args(uint32_t id, + const rocprofiler_callback_tracing_ompt_data_t& data, + rocprofiler_callback_tracing_operation_args_cb_t callback, + int32_t max_deref, + void* user_data) +{ + if(callback) + iterate_args(id, + data, + callback, + max_deref, + user_data, + std::make_index_sequence{}); +} + +void +update_table(ompt_update_func f) +{ + update_table(f, std::make_index_sequence{}); +} + +} // namespace ompt +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.def.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.def.cpp new file mode 100644 index 0000000000..0cb1287fb8 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.def.cpp @@ -0,0 +1,73 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#if defined(ROCPROFILER_LIB_ROCPROFILER_OMPT_OMPT_CPP_IMPL) && \ + ROCPROFILER_LIB_ROCPROFILER_OMPT_OMPT_CPP_IMPL == 1 + +# include "lib/common/mpl.hpp" +# include "lib/rocprofiler-sdk/ompt/defines.hpp" +# include "lib/rocprofiler-sdk/ompt/ompt.hpp" +# include "lib/rocprofiler-sdk/ompt/utils.hpp" + +# include +# include + +# include + +// clang-format off +OMPT_INFO_DEFINITION_V("omp_thread_begin", ompt_callback_thread_begin, ROCPROFILER_OMPT_ID_thread_begin, ompt_callback_thread_begin_t, thread_begin, ompt_thread_begin_fn, thread_type, thread_data) +OMPT_INFO_DEFINITION_V("omp_thread_end", ompt_callback_thread_end, ROCPROFILER_OMPT_ID_thread_end, ompt_callback_thread_end_t, thread_end, ompt_thread_end_fn, thread_data) +OMPT_INFO_DEFINITION_V("omp_parallel_begin", ompt_callback_parallel_begin, ROCPROFILER_OMPT_ID_parallel_begin, ompt_callback_parallel_begin_t, parallel_begin, ompt_parallel_begin_fn, encountering_task_data, encountering_task_frame, parallel_data, requested_parallelism, flags, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_parallel_end", ompt_callback_parallel_end, ROCPROFILER_OMPT_ID_parallel_end, ompt_callback_parallel_end_t, parallel_end, ompt_parallel_end_fn, parallel_data, encountering_task_data, flags, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_task_create", ompt_callback_task_create, ROCPROFILER_OMPT_ID_task_create, ompt_callback_task_create_t, task_create, ompt_task_create_fn, encountering_task_data, encountering_task_frame, new_task_data, flags, has_dependences, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_task_schedule", ompt_callback_task_schedule, ROCPROFILER_OMPT_ID_task_schedule, ompt_callback_task_schedule_t, task_schedule, ompt_task_schedule_fn, prior_task_data, prior_task_status, next_task_data) +OMPT_INFO_DEFINITION_V("omp_implicit_task", ompt_callback_implicit_task, ROCPROFILER_OMPT_ID_implicit_task, ompt_callback_implicit_task_t, implicit_task, ompt_implicit_task_fn, endpoint, parallel_data, task_data, actual_parallelism, index, flags) +OMPT_INFO_DEFINITION_V("omp_device_initialize", ompt_callback_device_initialize, ROCPROFILER_OMPT_ID_device_initialize, ompt_callback_device_initialize_t, device_initialize, ompt_device_initialize_fn, device_num, type, device, lookup, documentation) +OMPT_INFO_DEFINITION_V("omp_device_finalize", ompt_callback_device_finalize, ROCPROFILER_OMPT_ID_device_finalize, ompt_callback_device_finalize_t, device_finalize, ompt_device_finalize_fn, device_num) +OMPT_INFO_DEFINITION_V("omp_device_load", ompt_callback_device_load, ROCPROFILER_OMPT_ID_device_load, ompt_callback_device_load_t, device_load, ompt_device_load_fn, device_num, filename, offset_in_file, vma_in_file, bytes, host_addr, device_addr, module_id) +// OMPT_INFO_DEFINITION_V("omp_device_unload", ompt_callback_device_unload, ROCPROFILER_OMPT_ID_device_unload, ompt_callback_device_unload_t, device_unload, ompt_device_unload_fn, device_num, module_id) +OMPT_INFO_DEFINITION_V("omp_sync_region_wait", ompt_callback_sync_region_wait, ROCPROFILER_OMPT_ID_sync_region_wait, ompt_callback_sync_region_wait_t, sync_region_wait, ompt_sync_region_wait_fn, kind, endpoint, parallel_data, task_data, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_mutex_released", ompt_callback_mutex_released, ROCPROFILER_OMPT_ID_mutex_released, ompt_callback_mutex_released_t, mutex_released, ompt_mutex_released_fn, kind, wait_id, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_dependences", ompt_callback_dependences, ROCPROFILER_OMPT_ID_dependences, ompt_callback_dependences_t, dependences, ompt_dependences_fn, task_data, deps, ndeps) +OMPT_INFO_DEFINITION_V("omp_task_dependence", ompt_callback_task_dependence, ROCPROFILER_OMPT_ID_task_dependence, ompt_callback_task_dependence_t, task_dependence, ompt_task_dependence_fn, src_task_data, sink_task_data) +OMPT_INFO_DEFINITION_V("omp_work", ompt_callback_work, ROCPROFILER_OMPT_ID_work, ompt_callback_work_t, work, ompt_work_fn, work_type, endpoint, parallel_data, task_data, count, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_masked", ompt_callback_masked, ROCPROFILER_OMPT_ID_masked, ompt_callback_masked_t, masked, ompt_masked_fn, endpoint, parallel_data, task_data, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_sync_region", ompt_callback_sync_region, ROCPROFILER_OMPT_ID_sync_region, ompt_callback_sync_region_t, sync_region, ompt_sync_region_fn, kind, endpoint, parallel_data, task_data, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_lock_init", ompt_callback_lock_init, ROCPROFILER_OMPT_ID_lock_init, ompt_callback_lock_init_t, lock_init, ompt_lock_init_fn, kind, hint, impl, wait_id, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_lock_destroy", ompt_callback_lock_destroy, ROCPROFILER_OMPT_ID_lock_destroy, ompt_callback_lock_destroy_t, lock_destroy, ompt_lock_destroy_fn, kind, wait_id, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_mutex_acquire", ompt_callback_mutex_acquire, ROCPROFILER_OMPT_ID_mutex_acquire, ompt_callback_mutex_acquire_t, mutex_acquire, ompt_mutex_acquire_fn, kind, hint, impl, wait_id, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_mutex_acquired", ompt_callback_mutex_acquired, ROCPROFILER_OMPT_ID_mutex_acquired, ompt_callback_mutex_acquired_t, mutex_acquired, ompt_mutex_acquired_fn, kind, wait_id, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_nest_lock", ompt_callback_nest_lock, ROCPROFILER_OMPT_ID_nest_lock, ompt_callback_nest_lock_t, nest_lock, ompt_nest_lock_fn, endpoint, wait_id, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_flush", ompt_callback_flush, ROCPROFILER_OMPT_ID_flush, ompt_callback_flush_t, flush, ompt_flush_fn, thread_data, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_cancel", ompt_callback_cancel, ROCPROFILER_OMPT_ID_cancel, ompt_callback_cancel_t, cancel, ompt_cancel_fn, task_data, flags, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_reduction", ompt_callback_reduction, ROCPROFILER_OMPT_ID_reduction, ompt_callback_reduction_t, reduction, ompt_reduction_fn, kind, endpoint, parallel_data, task_data, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_dispatch", ompt_callback_dispatch, ROCPROFILER_OMPT_ID_dispatch, ompt_callback_dispatch_t, dispatch, ompt_dispatch_fn, parallel_data, task_data, kind, instance) +OMPT_INFO_DEFINITION_V("omp_target_emi", ompt_callback_target_emi, ROCPROFILER_OMPT_ID_target_emi, ompt_callback_target_emi_t, target_emi, ompt_target_emi_fn, kind, endpoint, device_num, task_data, target_task_data, target_data, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_target_data_op_emi", ompt_callback_target_data_op_emi, ROCPROFILER_OMPT_ID_target_data_op_emi, ompt_callback_target_data_op_emi_t, target_data_op_emi, ompt_target_data_op_emi_fn, endpoint, target_task_data, target_data, host_op_id, optype, src_address, src_device_num, dst_address, dst_device_num, bytes, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_target_submit_emi", ompt_callback_target_submit_emi, ROCPROFILER_OMPT_ID_target_submit_emi, ompt_callback_target_submit_emi_t, target_submit_emi, ompt_target_submit_emi_fn, endpoint, target_data, host_op_id, requested_num_teams) +// OMPT_INFO_DEFINITION_V("omp_target_map_emi", ompt_callback_target_map_emi, ROCPROFILER_OMPT_ID_target_map_emi, ompt_callback_target_map_emi_t, target_map_emi, ompt_target_map_emi_fn, nitems, host_addr, device_addr, bytes, mapping_flags, codeptr_ra) +OMPT_INFO_DEFINITION_V("omp_error", ompt_callback_error, ROCPROFILER_OMPT_ID_error, ompt_callback_error_t, error, ompt_error_fn, severity, message, length, codeptr_ra) +// clang-format on + +#else +# error "Do not compile this file directly. It is included by lib/rocprofiler-sdk/ompt/ompt.cpp" +#endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.hpp new file mode 100644 index 0000000000..91341cbd96 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/ompt.hpp @@ -0,0 +1,200 @@ +// mit license +// +// copyright (c) 2023 advanced micro devices, inc. all rights reserved. +// +// permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "software"), to deal +// in the software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the software, and to permit persons to whom the software is +// furnished to do so, subject to the following conditions: +// +// the above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the software. +// +// the software is provided "as is", without warranty of any kind, express or +// implied, including but not limited to the warranties of merchantability, +// fitness for a particular purpose and noninfringement. in no event shall the +// authors or copyright holders be liable for any claim, damages or other +// liability, whether in an action of contract, tort or otherwise, arising from, +// out of or in connection with the software or the use or other dealings in +// the software. + +#pragma once + +#include "lib/rocprofiler-sdk/context/correlation_id.hpp" +#include "lib/rocprofiler-sdk/tracing/fwd.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace rocprofiler +{ +namespace ompt +{ +struct ompt_table +{ + ompt_callback_thread_begin_t ompt_thread_begin_fn = nullptr; + ompt_callback_thread_end_t ompt_thread_end_fn = nullptr; + ompt_callback_parallel_begin_t ompt_parallel_begin_fn = nullptr; + ompt_callback_parallel_end_t ompt_parallel_end_fn = nullptr; + ompt_callback_task_create_t ompt_task_create_fn = nullptr; + ompt_callback_task_schedule_t ompt_task_schedule_fn = nullptr; + ompt_callback_implicit_task_t ompt_implicit_task_fn = nullptr; + ompt_callback_device_initialize_t ompt_device_initialize_fn = nullptr; + ompt_callback_device_finalize_t ompt_device_finalize_fn = nullptr; + ompt_callback_device_load_t ompt_device_load_fn = nullptr; + // ompt_callback_device_unload_t ompt_device_unload_fn = nullptr; + ompt_callback_sync_region_t ompt_sync_region_wait_fn = nullptr; + ompt_callback_mutex_t ompt_mutex_released_fn = nullptr; + ompt_callback_dependences_t ompt_dependences_fn = nullptr; + ompt_callback_task_dependence_t ompt_task_dependence_fn = nullptr; + ompt_callback_work_t ompt_work_fn = nullptr; + ompt_callback_masked_t ompt_masked_fn = nullptr; + ompt_callback_target_map_t ompt_target_map_fn = nullptr; + ompt_callback_sync_region_t ompt_sync_region_fn = nullptr; + ompt_callback_mutex_acquire_t ompt_lock_init_fn = nullptr; + ompt_callback_mutex_t ompt_lock_destroy_fn = nullptr; + ompt_callback_mutex_acquire_t ompt_mutex_acquire_fn = nullptr; + ompt_callback_mutex_t ompt_mutex_acquired_fn = nullptr; + ompt_callback_nest_lock_t ompt_nest_lock_fn = nullptr; + ompt_callback_flush_t ompt_flush_fn = nullptr; + ompt_callback_cancel_t ompt_cancel_fn = nullptr; + ompt_callback_sync_region_t ompt_reduction_fn = nullptr; + ompt_callback_dispatch_t ompt_dispatch_fn = nullptr; + ompt_callback_target_emi_t ompt_target_emi_fn = nullptr; + ompt_callback_target_data_op_emi_t ompt_target_data_op_emi_fn = nullptr; + ompt_callback_target_submit_emi_t ompt_target_submit_emi_fn = nullptr; + // ompt_callback_target_map_emi_t ompt_target_map_emi_fn = nullptr; + ompt_callback_error_t ompt_error_fn = nullptr; +}; + +struct ompt_domain_info +{ + using args_type = rocprofiler_ompt_args_t; + using retval_type = void; + using callback_data_type = rocprofiler_callback_tracing_ompt_data_t; + using buffer_data_type = rocprofiler_buffer_tracing_ompt_record_t; + using enum_type = rocprofiler_ompt_operation_t; + static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_OMPT; + static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_OMPT; + static constexpr auto none = ROCPROFILER_OMPT_ID_NONE; + static constexpr auto last = ROCPROFILER_OMPT_ID_LAST; + static constexpr auto ompt_last = ROCPROFILER_OMPT_ID_callback_functions; + static constexpr auto external_correlation_id_domain_idx = + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_OMPT; +}; + +using buffer_ompt_record_t = rocprofiler_buffer_tracing_ompt_record_t; +using callback_ompt_data_t = rocprofiler_callback_tracing_ompt_data_t; + +// save state for ompt between callbacks +struct ompt_save_state +{ + uint64_t thr_id; // thread this was created on + uint64_t start_timestamp; // timestamp when it was created + rocprofiler_ompt_operation_t operation_idx; // for error checking + context::correlation_id* corr_id; // correlation id + tracing::external_correlation_id_map_t external_corr_ids; + tracing::callback_context_data_vec_t callback_contexts; + tracing::buffered_context_data_vec_t buffered_contexts; +}; + +// proxy for ompt_data_t pointers received from OMPT callbacks +// SDK gets index 0, client get index 1 +class ompt_data_proxy +{ +public: + ompt_data_t* get_client_ptr(ompt_data_t* ompt_ptr) { return get<1>(ompt_ptr); } + ompt_data_t* get_internal_ptr(ompt_data_t* ompt_ptr) { return get<0>(ompt_ptr); } + +private: + struct proxy_ptrs + { + std::array v; + }; + + // get the proxy pointer for idx. If ompt_ptr->ptr is 0, + // allocate a new proxy struct and assign it to ompt_ptr->ptr + // return the address of the requested proxy element, + // or null if ompt_ptr is null + template + ompt_data_t* get(ompt_data_t* ompt_ptr) + { + static constexpr proxy_ptrs nulval{}; + if(ompt_ptr == nullptr) return nullptr; + if(ompt_ptr->ptr == nullptr) + { + std::lock_guard lk(m); + ompt_ptr->ptr = &proxies.emplace_back(nulval); + } + auto* ptr = static_cast(ompt_ptr->ptr); + return &(ptr->v[idx]); + } + + std::deque proxies; + std::mutex m; +}; + +// function to return client pointer for use outside +ompt_data_t* +proxy_data_ptr(ompt_data_t* real_ptr); + +struct ompt_task_save_state +{ + context::correlation_id* corr_id; + int task_flags; +}; + +template +struct ompt_impl +{ + template + static void set_data_args(DataArgst&, Args... args); + + template + static void begin(ompt_data_t* data, Args... args); + + template + static void end(ompt_data_t* data, Args... args); + + template + static context::correlation_id* event_common(Args... args); + + template + static void event(Args&&... args); +}; + +template +struct ompt_info; + +const char* +name_by_id(uint32_t id); + +std::vector +get_ids(); + +void +iterate_args(uint32_t id, + const rocprofiler_callback_tracing_ompt_data_t& data, + rocprofiler_callback_tracing_operation_args_cb_t callback, + int32_t max_deref, + void* user_data); + +using ompt_update_func = void (*)(const char* cbname, ompt_callback_t* cbf, int cbnum); +void update_table(ompt_update_func); + +void +update_callback(rocprofiler_ompt_callback_functions_t& cb_functions); + +} // namespace ompt +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/utils.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/utils.hpp new file mode 100644 index 0000000000..72b4144902 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/ompt/utils.hpp @@ -0,0 +1,72 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/common/stringize_arg.hpp" +#include "lib/rocprofiler-sdk/ompt/details/format.hpp" + +#include +#include + +#include +#include + +#include +#include + +namespace rocprofiler +{ +namespace ompt +{ +namespace utils +{ +template +auto +stringize_impl(const Tp& _v) +{ + using value_type = std::decay_t; + + if constexpr(fmt::is_formattable::value && !std::is_pointer::value) + { + return fmt::format("{}", _v); + } + else + { + auto _ss = std::stringstream{}; + _ss << _v; + return _ss.str(); + } +} + +template +auto +stringize(int32_t max_deref, Args... args) +{ + using array_type = common::stringified_argument_array_t; + (void) max_deref; + return array_type{common::stringize_arg( + max_deref, args, [](const auto& _v) { return stringize_impl(_v); })...}; +} +} // namespace utils +} // namespace ompt +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/CMakeLists.txt deleted file mode 100644 index d55023f459..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -# -# -set(ROCPROFILER_LIB_OPENMP_SOURCES openmp.cpp) -set(ROCPROFILER_LIB_OPENMP_HEADERS defines.hpp openmp.hpp utils.hpp) - -target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_OPENMP_SOURCES}) - -add_subdirectory(details) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/details/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/details/CMakeLists.txt deleted file mode 100644 index c919310c2e..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/details/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# -# -# -set(ROCPROFILER_LIB_OPENMP_DETAILS_SOURCES) -set(ROCPROFILER_LIB_OPENMP_DETAILS_HEADERS format.hpp) - -target_sources( - rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_OPENMP_DETAILS_SOURCES} - ${ROCPROFILER_LIB_OPENMP_DETAILS_HEADERS}) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/details/format.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/details/format.hpp deleted file mode 100644 index a7047b770c..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/openmp/details/format.hpp +++ /dev/null @@ -1,279 +0,0 @@ -// MIT License -// -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#pragma once - -#include - -#include "fmt/core.h" - -#define ROCP_SDK_OPENMP_FORMATTER(TYPE, ...) \ - template <> \ - struct formatter : rocprofiler::openmp::details::base_formatter \ - { \ - template \ - auto format(const TYPE& v, Ctx& ctx) const \ - { \ - return fmt::format_to(ctx.out(), __VA_ARGS__); \ - } \ - }; - -#define ROCP_SDK_OPENMP_FORMAT_CASE_STMT(PREFIX, SUFFIX) \ - case PREFIX##_##SUFFIX: return fmt::format_to(ctx.out(), #SUFFIX) - -namespace rocprofiler -{ -namespace openmp -{ -namespace details -{ -struct base_formatter -{ - template - constexpr auto parse(ParseContext& ctx) - { - return ctx.begin(); - } -}; -} // namespace details -} // namespace openmp -} // namespace rocprofiler - -namespace fmt -{ -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(ompt_set_result_t v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_set, error); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_set, never); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_set, impossible); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_set, sometimes); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_set, sometimes_paired); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_set, always); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(ompt_thread_t v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_thread, initial); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_thread, worker); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_thread, other); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt_thread, unknown); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(ompt_scope_endpoint_t v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, scope_begin); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, scope_end); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, scope_beginend); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(ompt_dispatch_t v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, dispatch_iteration); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, dispatch_section); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, dispatch_ws_loop_chunk); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, dispatch_taskloop_chunk); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, dispatch_distribute_chunk); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(ompt_sync_region_t v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_barrier); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_barrier_implicit); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_barrier_explicit); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_barrier_implementation); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_taskwait); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_taskgroup); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_reduction); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_barrier_implicit_workshare); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_barrier_implicit_parallel); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, sync_region_barrier_teams); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(ompt_target_data_op_t v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_alloc); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_transfer_to_device); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_transfer_from_device); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_delete); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_associate); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_disassociate); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_alloc_async); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_transfer_to_device_async); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_transfer_from_device_async); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, target_data_delete_async); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(const ompt_data_t& v, Ctx& ctx) const - { - return fmt::format_to(ctx.out(), "{}", v.value); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(const ompt_work_t& v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_loop); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_sections); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_single_executor); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_single_other); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_workshare); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_distribute); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_taskloop); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_scope); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_loop_static); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_loop_dynamic); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_loop_guided); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, work_loop_other); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(const ompt_task_status_t& v, Ctx& ctx) const - { - switch(v) - { - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, task_complete); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, task_yield); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, task_cancel); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, task_detach); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, task_early_fulfill); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, task_late_fulfill); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, task_switch); - ROCP_SDK_OPENMP_FORMAT_CASE_STMT(ompt, taskwait_complete); - } - return fmt::format_to(ctx.out(), "Unknown"); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(const ompt_frame_t& v, Ctx& ctx) const - { - return fmt::format_to( - ctx.out(), - "{}exit_frame={}, enter_frame={}, exit_frame_flags={}, enter_frame_flags={}{}", - '{', - v.exit_frame, - v.enter_frame, - v.exit_frame_flags, - v.enter_frame_flags, - '}'); - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(const ompt_dependence_t& v, Ctx& ctx) const - { - // stub - return fmt::format_to(ctx.out(), "(dependence)"); - (void) v; - } -}; - -template <> -struct formatter : rocprofiler::openmp::details::base_formatter -{ - template - auto format(const ompt_dispatch_chunk_t& v, Ctx& ctx) const - { - return fmt::format_to( - ctx.out(), "{}start={}, iterations={}{}", '{', v.start, v.iterations, '}'); - } -}; -} // namespace fmt - -#undef ROCP_SDK_OPENMP_FORMATTER -#undef ROCP_SDK_OPENMP_OSTREAM_FORMATTER -#undef ROCP_SDK_OPENMP_FORMAT_CASE_STMT diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index 9b26f3b9e8..1784f61ec3 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -41,6 +41,7 @@ #include "lib/rocprofiler-sdk/intercept_table.hpp" #include "lib/rocprofiler-sdk/internal_threading.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" +#include "lib/rocprofiler-sdk/ompt.hpp" #include "lib/rocprofiler-sdk/page_migration/page_migration.hpp" #include "lib/rocprofiler-sdk/pc_sampling/code_object.hpp" #include "lib/rocprofiler-sdk/pc_sampling/service.hpp" @@ -51,6 +52,7 @@ #include #include #include +#include #include #include @@ -639,6 +641,7 @@ finalize() counters::device_counting_service_finalize(); hsa::queue_controller_fini(); thread_trace::finalize(); + ompt::finalize_ompt(); page_migration::finalize(); #if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0 // WARNING: this must precede `code_object::finalize()` diff --git a/projects/rocprofiler-sdk/source/scripts/thread-sanitizer-suppr.txt b/projects/rocprofiler-sdk/source/scripts/thread-sanitizer-suppr.txt index ea9a4bf299..83d4dcd368 100644 --- a/projects/rocprofiler-sdk/source/scripts/thread-sanitizer-suppr.txt +++ b/projects/rocprofiler-sdk/source/scripts/thread-sanitizer-suppr.txt @@ -11,6 +11,10 @@ race:libamdhip64.so # data race arising from hsa runtime race:libhsa-runtime64.so +# data race(s) arising from OpenMP +race:__kmp_resume_template +race:__kmp_suspend_initialize_thread + # unlock of an unlocked mutex (or by a wrong thread) mutex:libhsa-runtime64.so diff --git a/projects/rocprofiler-sdk/tests/CMakeLists.txt b/projects/rocprofiler-sdk/tests/CMakeLists.txt index d3e914ff90..c67a82cb20 100644 --- a/projects/rocprofiler-sdk/tests/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/CMakeLists.txt @@ -61,7 +61,8 @@ add_subdirectory(page-migration) add_subdirectory(pc_sampling) add_subdirectory(thread-trace) add_subdirectory(hip-graph-tracing) +add_subdirectory(counter-collection) +add_subdirectory(openmp-tools) # rocprofv3 validation tests add_subdirectory(rocprofv3) -add_subdirectory(counter-collection) diff --git a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt index f24d95fcb3..d1dab34b44 100644 --- a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt @@ -12,6 +12,7 @@ set(CMAKE_BUILD_RPATH # applications used by integration tests which DO link to rocprofiler-sdk-roctx add_subdirectory(reproducible-runtime) add_subdirectory(transpose) +add_subdirectory(openmp) set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib") diff --git a/projects/rocprofiler-sdk/tests/bin/openmp/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/openmp/CMakeLists.txt new file mode 100644 index 0000000000..e6a85849d0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/openmp/CMakeLists.txt @@ -0,0 +1,41 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT OMP_TARGET_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(OMP_TARGET_COMPILER + "${amdclangpp_EXECUTABLE}" + CACHE FILEPATH "") + endif() +endif() + +project(rocprofiler-tests-bin-openmp LANGUAGES CXX) + +find_package(rocprofiler-sdk REQUIRED) + +set(DEFAULT_GPU_TARGETS + "gfx906" + "gfx908" + "gfx90a" + "gfx940" + "gfx941" + "gfx942" + "gfx1100" + "gfx1101" + "gfx1102") + +set(OPENMP_GPU_TARGETS + "${DEFAULT_GPU_TARGETS}" + CACHE STRING "GPU targets to compile for") + +add_subdirectory(target) diff --git a/projects/rocprofiler-sdk/tests/bin/openmp/target/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/openmp/target/CMakeLists.txt new file mode 100644 index 0000000000..f855c63749 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/openmp/target/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# +# + +set(CMAKE_BUILD_TYPE "RelWithDebInfo") + +find_package(Threads REQUIRED) +find_package(rocprofiler-sdk-roctx REQUIRED) + +add_executable(openmp-target) +target_sources(openmp-target PRIVATE openmp-target.cpp) +target_link_libraries(openmp-target PRIVATE Threads::Threads + rocprofiler-sdk-roctx::rocprofiler-sdk-roctx) +target_compile_options(openmp-target PRIVATE -fopenmp) +target_link_options(openmp-target PRIVATE -fopenmp) + +foreach(_TARGET ${OPENMP_GPU_TARGETS}) + target_compile_options(openmp-target PRIVATE --offload-arch=${_TARGET}) + target_link_options(openmp-target PRIVATE --offload-arch=${_TARGET}) +endforeach() + +include(rocprofiler-sdk-custom-compilation) +rocprofiler_sdk_custom_compilation(TARGET openmp-target COMPILER ${OMP_TARGET_COMPILER}) diff --git a/projects/rocprofiler-sdk/tests/bin/openmp/target/openmp-target.cpp b/projects/rocprofiler-sdk/tests/bin/openmp/target/openmp-target.cpp new file mode 100644 index 0000000000..ddcdb383a5 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/openmp/target/openmp-target.cpp @@ -0,0 +1,155 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include + +#include +#include + +constexpr float EPS_FLOAT = 1.0e-7f; +constexpr double EPS_DOUBLE = 1.0e-15; + +#pragma omp declare target +template +T +mul(T a, T b) +{ + volatile T c = a * b; + return c; +} +#pragma omp end declare target + +template +void +vmul(T* a, T* b, T* c, int N) +{ +#pragma omp target map(to : a [0:N], b [0:N]) map(from : c [0:N]) +#pragma omp teams distribute parallel for + for(int i = 0; i < N; i++) + { + for(int j = 0; j < 100000; ++j) + c[i] = mul(a[i], b[i]); + } +} + +int +main() +{ + auto range_id = roctxRangeStart("main"); + + constexpr int N = 100000; + int a_i[N], b_i[N], c_i[N], validate_i[N]; + float a_f[N], b_f[N], c_f[N], validate_f[N]; + double a_d[N], b_d[N], c_d[N], validate_d[N]; + int N_errors = 0; + bool flag = false; + + roctxMark("initialization"); + +#pragma omp parallel for + for(int i = 0; i < N; ++i) + { + a_f[i] = a_i[i] = i + 1; + b_f[i] = b_i[i] = i + 2; + a_d[i] = a_i[i]; + b_d[i] = b_i[i]; + validate_i[i] = a_i[i] * b_i[i]; + validate_f[i] = a_f[i] * b_f[i]; + validate_d[i] = a_d[i] * b_d[i]; + } + + vmul(a_i, b_i, c_i, N); + vmul(a_f, b_f, c_f, N); + + auto tid = roctx_thread_id_t{}; + // get the thread id recognized by rocprofiler-sdk from roctx + roctxGetThreadId(&tid); + // pause API tracing + roctxProfilerPause(tid); + + // we don't expect to see the third vmul + vmul(a_d, b_d, c_d, N); + + // resume API tracing + roctxProfilerResume(tid); + + for(int i = 0; i < N; i++) + { + if(c_i[i] != validate_i[i]) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf( + "First fail: c_i[%d](%d) != validate_i[%d](%d)\n", i, c_i[i], i, validate_i[i]); + flag = true; + } + } + } + flag = false; + for(int i = 0; i < N; i++) + { + if(fabs(c_f[i] - validate_f[i]) > EPS_FLOAT) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf("First fail: c_f[%d](%f) != validate_f[%d](%f)\n", + i, + static_cast(c_f[i]), + i, + static_cast(validate_f[i])); + flag = true; + } + } + } + flag = false; + for(int i = 0; i < N; i++) + { + if(fabs(c_d[i] - validate_d[i]) > EPS_DOUBLE) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf( + "First fail: c_d[%d](%f) != validate_d[%d](%f)\n", i, c_d[i], i, validate_d[i]); + flag = true; + } + } + } + if(N_errors == 0) + { + printf("Success\n"); + return 0; + } + else + { + printf("Total %d failures\n", N_errors); + printf("Fail\n"); + return 1; + } + + roctxRangeStop(range_id); +} diff --git a/projects/rocprofiler-sdk/tests/openmp-tools/CMakeLists.txt b/projects/rocprofiler-sdk/tests/openmp-tools/CMakeLists.txt new file mode 100644 index 0000000000..15ca9c35b2 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/openmp-tools/CMakeLists.txt @@ -0,0 +1,75 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-openmp-tools + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +set(PYTEST_ARGS) +if(ROCPROFILER_MEMCHECK MATCHES "(Address|Thread)Sanitizer" OR ROCPROFILER_BUILD_CODECOV) + set(PYTEST_ARGS -k "not test_total_runtime") +endif() + +if(ROCPROFILER_MEMCHECK_PRELOAD_ENV) + set(PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") +else() + set(PRELOAD_ENV "LD_PRELOAD=$") +endif() + +if(ROCPROFILER_MEMCHECK STREQUAL "ThreadSanitizer") + set(IS_THREAD_SANITIZER ON) +else() + set(IS_THREAD_SANITIZER OFF) +endif() + +add_test(NAME test-openmp-tools-execute COMMAND $) + +set(openmp-tools-env + "${PRELOAD_ENV}" + "OMP_NUM_THREADS=2" + "OMP_DISPLAY_ENV=1" + "OMP_TARGET_OFFLOAD=mandatory" + "ROCR_VISIBLE_DEVICES=0" + "ROCPROFILER_TOOL_OUTPUT_FILE=openmp-tools-test.json" + "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH}" + ) + +set_tests_properties( + test-openmp-tools-execute + PROPERTIES TIMEOUT + 100 + LABELS + "integration-tests;openmp-target" + ENVIRONMENT + "${openmp-tools-env}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED + "${IS_THREAD_SANITIZER}") + +# copy to binary directory +rocprofiler_configure_pytest_files(COPY validate.py conftest.py CONFIG pytest.ini) + +add_test( + NAME test-openmp-tools-validate + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py ${PYTEST_ARGS} + --input ${CMAKE_CURRENT_BINARY_DIR}/openmp-tools-test.json) + +set_tests_properties( + test-openmp-tools-validate + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;openmp-target" + DEPENDS + test-openmp-tools-execute + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED + "${IS_THREAD_SANITIZER}") diff --git a/projects/rocprofiler-sdk/tests/openmp-tools/conftest.py b/projects/rocprofiler-sdk/tests/openmp-tools/conftest.py new file mode 100644 index 0000000000..fd1d59b7f4 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/openmp-tools/conftest.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +import json +import pytest + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict + + +def pytest_addoption(parser): + parser.addoption( + "--input", + action="store", + default="openmp-tools-test.json", + help="Input JSON", + ) + + +@pytest.fixture +def input_data(request): + filename = request.config.getoption("--input") + with open(filename, "r") as inp: + return dotdict(json.load(inp)) diff --git a/projects/rocprofiler-sdk/tests/openmp-tools/pytest.ini b/projects/rocprofiler-sdk/tests/openmp-tools/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/openmp-tools/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/openmp-tools/validate.py b/projects/rocprofiler-sdk/tests/openmp-tools/validate.py new file mode 100644 index 0000000000..a5ba54af56 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/openmp-tools/validate.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python3 + +import sys +import pytest + + +# helper function +def node_exists(name, data, min_len=1): + assert name in data + assert data[name] is not None + if isinstance(data[name], (list, tuple, dict, set)): + assert len(data[name]) >= min_len + + +def test_data_structure(input_data): + """verify minimum amount of expected data is present""" + data = input_data + + node_exists("rocprofiler-sdk-json-tool", data) + + sdk_data = data["rocprofiler-sdk-json-tool"] + + node_exists("metadata", sdk_data) + node_exists("pid", sdk_data["metadata"]) + node_exists("main_tid", sdk_data["metadata"]) + node_exists("init_time", sdk_data["metadata"]) + node_exists("fini_time", sdk_data["metadata"]) + + node_exists("agents", sdk_data) + node_exists("call_stack", sdk_data) + node_exists("callback_records", sdk_data) + node_exists("buffer_records", sdk_data) + + node_exists("names", sdk_data.callback_records) + node_exists("code_objects", sdk_data.callback_records) + node_exists("kernel_symbols", sdk_data.callback_records) + node_exists("hsa_api_traces", sdk_data.callback_records) + node_exists("hip_api_traces", sdk_data.callback_records, 0) + node_exists("marker_api_traces", sdk_data.callback_records) + node_exists("rccl_api_traces", sdk_data.callback_records, 0) + node_exists("ompt_traces", sdk_data.callback_records) + node_exists("kernel_dispatch", sdk_data.callback_records) + + node_exists("names", sdk_data.buffer_records) + node_exists("kernel_dispatch", sdk_data.buffer_records) + node_exists("memory_copies", sdk_data.buffer_records, 0) + node_exists("hsa_api_traces", sdk_data.buffer_records) + node_exists("hip_api_traces", sdk_data.buffer_records, 0) + node_exists("marker_api_traces", sdk_data.buffer_records) + node_exists("rccl_api_traces", sdk_data.buffer_records, 0) + node_exists("ompt_traces", sdk_data.buffer_records) + node_exists("retired_correlation_ids", sdk_data.buffer_records) + + +def test_size_entries(input_data): + # check that size fields are > 0 but account for function arguments + # which are named "size" + def check_size(data, bt): + if "size" in data.keys(): + if isinstance(data.size, str) and bt.endswith('["args"]'): + pass + else: + assert data.size > 0, f"origin: {bt}" + + # recursively check the entire data structure + def iterate_data(data, bt): + if isinstance(data, (list, tuple)): + for i, itr in enumerate(data): + if isinstance(itr, dict): + check_size(itr, f"{bt}[{i}]") + iterate_data(itr, f"{bt}[{i}]") + elif isinstance(data, dict): + check_size(data, f"{bt}") + for key, itr in data.items(): + iterate_data(itr, f'{bt}["{key}"]') + + # start recursive check over entire JSON dict + iterate_data(input_data, "input_data") + + +def test_timestamps(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + cb_start = {} + cb_end = {} + for titr in [ + "hsa_api_traces", + "marker_api_traces", + "hip_api_traces", + "rccl_api_traces", + "ompt_traces", + ]: + for itr in sdk_data.callback_records[titr]: + cid = itr.correlation_id.internal + phase = itr.phase + if phase == 1: + cb_start[cid] = itr.timestamp + elif phase == 2: + cb_end[cid] = itr.timestamp + assert cb_start[cid] <= itr.timestamp + + for itr in sdk_data.buffer_records[titr]: + assert itr.start_timestamp <= itr.end_timestamp + + for titr in ["kernel_dispatch", "memory_copies"]: + for itr in sdk_data.buffer_records[titr]: + assert itr.start_timestamp < itr.end_timestamp, f"[{titr}] {itr}" + assert itr.correlation_id.internal > 0, f"[{titr}] {itr}" + assert itr.correlation_id.external > 0, f"[{titr}] {itr}" + assert sdk_data.metadata.init_time < itr.start_timestamp, f"[{titr}] {itr}" + assert sdk_data.metadata.init_time < itr.end_timestamp, f"[{titr}] {itr}" + assert sdk_data.metadata.fini_time > itr.start_timestamp, f"[{titr}] {itr}" + assert sdk_data.metadata.fini_time > itr.end_timestamp, f"[{titr}] {itr}" + + api_start = cb_start[itr.correlation_id.internal] + # api_end = cb_end[itr.correlation_id.internal] + assert api_start < itr.start_timestamp, f"[{titr}] {itr}" + # assert api_end <= itr.end_timestamp, f"[{titr}] {itr}" + + +def test_total_runtime(input_data): + sdk_data = input_data["rocprofiler-sdk-json-tool"] + + runtime_data = [] + for itr in sdk_data.buffer_records.kernel_dispatch: + elapsed = itr.end_timestamp - itr.start_timestamp + runtime_data.append(elapsed) # in nanoseconds + + expected_runtime = 1.0e-6 # one millisecond + + assert sum(runtime_data) >= expected_runtime + + +def test_internal_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + api_corr_ids = [] + for titr in [ + "hsa_api_traces", + "marker_api_traces", + "hip_api_traces", + "rccl_api_traces", + "ompt_traces", + ]: + for itr in sdk_data.callback_records[titr]: + api_corr_ids.append(itr.correlation_id.internal) + + for itr in sdk_data.buffer_records[titr]: + api_corr_ids.append(itr.correlation_id.internal) + + api_corr_ids_sorted = sorted(api_corr_ids) + api_corr_ids_unique = list(set(api_corr_ids)) + + for itr in sdk_data.buffer_records.kernel_dispatch: + assert itr.correlation_id.internal in api_corr_ids_unique + + for itr in sdk_data.buffer_records.memory_copies: + assert itr.correlation_id.internal in api_corr_ids_unique + + len_corr_id_unq = len(api_corr_ids_unique) + assert len(api_corr_ids) != len_corr_id_unq + assert max(api_corr_ids_sorted) == len_corr_id_unq + + +def test_retired_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + def _sort_dict(inp): + return dict(sorted(inp.items())) + + api_corr_ids = {} + for titr in [ + "hsa_api_traces", + "marker_api_traces", + "hip_api_traces", + "rccl_api_traces", + "ompt_traces", + ]: + for itr in sdk_data.buffer_records[titr]: + corr_id = itr.correlation_id.internal + assert corr_id not in api_corr_ids.keys() + api_corr_ids[corr_id] = itr + + async_corr_ids = {} + for titr in ["kernel_dispatch", "memory_copies"]: + for itr in sdk_data.buffer_records[titr]: + corr_id = itr.correlation_id.internal + assert corr_id not in async_corr_ids.keys() + async_corr_ids[corr_id] = itr + + retired_corr_ids = {} + for itr in sdk_data.buffer_records.retired_correlation_ids: + corr_id = itr.internal_correlation_id + assert corr_id not in retired_corr_ids.keys() + retired_corr_ids[corr_id] = itr + + api_corr_ids = _sort_dict(api_corr_ids) + async_corr_ids = _sort_dict(async_corr_ids) + retired_corr_ids = _sort_dict(retired_corr_ids) + + for cid, itr in async_corr_ids.items(): + assert cid in retired_corr_ids.keys() + retired_ts = retired_corr_ids[cid].timestamp + end_ts = itr.end_timestamp + assert (retired_ts - end_ts) > 0, f"correlation-id: {cid}, data: {itr}" + + for cid, itr in api_corr_ids.items(): + assert cid in retired_corr_ids.keys() + retired_ts = retired_corr_ids[cid].timestamp + end_ts = itr.end_timestamp + assert (retired_ts - end_ts) > 0, f"correlation-id: {cid}, data: {itr}" + + assert len(api_corr_ids.keys()) == (len(retired_corr_ids.keys())) + + +def test_external_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + extern_corr_ids = [] + for titr in [ + "hsa_api_traces", + "marker_api_traces", + "hip_api_traces", + "rccl_api_traces", + "ompt_traces", + ]: + for itr in sdk_data.callback_records[titr]: + assert itr.correlation_id.external > 0 + assert itr.thread_id == itr.correlation_id.external + extern_corr_ids.append(itr.correlation_id.external) + + extern_corr_ids = list(set(sorted(extern_corr_ids))) + for titr in [ + "hsa_api_traces", + "marker_api_traces", + "hip_api_traces", + "rccl_api_traces", + "ompt_traces", + ]: + for itr in sdk_data.buffer_records[titr]: + assert itr.correlation_id.external > 0, f"[{titr}] {itr}" + assert itr.thread_id == itr.correlation_id.external, f"[{titr}] {itr}" + assert itr.thread_id in extern_corr_ids, f"[{titr}] {itr}" + assert itr.correlation_id.external in extern_corr_ids, f"[{titr}] {itr}" + + for titr in ["kernel_dispatch", "memory_copies"]: + for itr in sdk_data.buffer_records[titr]: + assert itr.correlation_id.external > 0, f"[{titr}] {itr}" + assert itr.correlation_id.external in extern_corr_ids, f"[{titr}] {itr}" + + +def test_kernel_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + symbol_info = {} + for itr in sdk_data.callback_records.kernel_symbols: + phase = itr.phase + payload = itr.payload + kern_id = payload.kernel_id + + assert phase == 1 or phase == 2 + assert kern_id > 0 + if phase == 1: + assert len(payload.kernel_name) > 0 + symbol_info[kern_id] = payload + elif phase == 2: + assert payload.kernel_id in symbol_info.keys() + assert payload.kernel_name == symbol_info[kern_id].kernel_name + + for itr in sdk_data.buffer_records.kernel_dispatch: + assert itr.dispatch_info.kernel_id in symbol_info.keys() + + for itr in sdk_data.callback_records.kernel_dispatch: + assert itr.payload.dispatch_info.kernel_id in symbol_info.keys() + + +def test_kernel_dispatch_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + num_dispatches = len(sdk_data.buffer_records.kernel_dispatch) + num_cb_dispatches = len(sdk_data.callback_records.kernel_dispatch) + + assert num_cb_dispatches == (3 * num_dispatches) + + bf_seq_ids = [] + for itr in sdk_data.buffer_records.kernel_dispatch: + bf_seq_ids.append(itr.dispatch_info.dispatch_id) + + cb_seq_ids = [] + for itr in sdk_data.callback_records.kernel_dispatch: + cb_seq_ids.append(itr.payload.dispatch_info.dispatch_id) + + bf_seq_ids = sorted(bf_seq_ids) + cb_seq_ids = sorted(cb_seq_ids) + + assert (3 * len(bf_seq_ids)) == len(cb_seq_ids) + + assert bf_seq_ids[0] == cb_seq_ids[0] + assert bf_seq_ids[-1] == cb_seq_ids[-1] + + def get_uniq(data): + return list(set(data)) + + bf_seq_ids_uniq = get_uniq(bf_seq_ids) + cb_seq_ids_uniq = get_uniq(cb_seq_ids) + + assert bf_seq_ids == bf_seq_ids_uniq + assert len(cb_seq_ids) == (3 * len(cb_seq_ids_uniq)) + assert len(bf_seq_ids) == num_dispatches + assert len(bf_seq_ids_uniq) == num_dispatches + assert len(cb_seq_ids_uniq) == num_dispatches + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp index 8eb0c8af41..fe3bc10252 100644 --- a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp +++ b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp @@ -378,6 +378,23 @@ struct rccl_api_callback_record_t } }; +struct ompt_callback_record_t +{ + uint64_t timestamp = 0; + rocprofiler_callback_tracing_record_t record = {}; + rocprofiler_callback_tracing_ompt_data_t payload = {}; + callback_arg_array_t args = {}; + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("timestamp", timestamp)); + cereal::save(ar, record); + ar(cereal::make_nvp("payload", payload)); + serialize_args(ar, args); + } +}; + struct kernel_dispatch_callback_record_t { uint64_t timestamp = 0; @@ -518,6 +535,7 @@ auto kernel_dispatch_cb_records = std::deque{}; auto memory_allocation_cb_records = std::deque{}; auto rccl_api_cb_records = std::deque{}; +auto ompt_cb_records = std::deque{}; int set_external_correlation_id(rocprofiler_thread_id_t thr_id, @@ -753,6 +771,18 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, rccl_api_cb_records.emplace_back( rccl_api_callback_record_t{ts, record, *data, std::move(args)}); } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_OMPT) + { + auto* data = static_cast(record.payload); + auto args = callback_arg_array_t{}; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + rocprofiler_iterate_callback_tracing_kind_operation_args( + record, save_args, record.phase, &args); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; + ompt_cb_records.emplace_back(ompt_callback_record_t{ts, record, *data, std::move(args)}); + } else if(record.kind == ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION) { auto* data = static_cast( @@ -787,6 +817,7 @@ auto page_migration_records = std::deque{}; auto rccl_api_bf_records = std::deque{}; +auto ompt_bf_records = std::deque{}; void tool_tracing_buffered(rocprofiler_context_id_t /*context*/, @@ -898,6 +929,13 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, rccl_api_bf_records.emplace_back(*record); } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_OMPT) + { + auto* record = + static_cast(header->payload); + + ompt_bf_records.emplace_back(*record); + } else if(header->kind == ROCPROFILER_BUFFER_TRACING_RUNTIME_INITIALIZATION) { auto* record = @@ -986,6 +1024,7 @@ rocprofiler_context_id_t hip_api_callback_ctx = {0}; rocprofiler_context_id_t marker_api_callback_ctx = {0}; rocprofiler_context_id_t code_object_ctx = {0}; rocprofiler_context_id_t rccl_api_callback_ctx = {0}; +rocprofiler_context_id_t ompt_callback_ctx = {0}; rocprofiler_context_id_t hsa_api_buffered_ctx = {0}; rocprofiler_context_id_t hip_api_buffered_ctx = {0}; rocprofiler_context_id_t marker_api_buffered_ctx = {0}; @@ -994,6 +1033,7 @@ rocprofiler_context_id_t memory_copy_buffered_ctx = {0}; rocprofiler_context_id_t memory_allocation_callback_ctx = {0}; rocprofiler_context_id_t memory_allocation_buffered_ctx = {0}; rocprofiler_context_id_t rccl_api_buffered_ctx = {0}; +rocprofiler_context_id_t ompt_buffered_ctx = {0}; rocprofiler_context_id_t counter_collection_ctx = {0}; rocprofiler_context_id_t scratch_memory_ctx = {0}; rocprofiler_context_id_t corr_id_retire_ctx = {0}; @@ -1015,6 +1055,7 @@ rocprofiler_buffer_id_t counter_collection_buffer = {}; rocprofiler_buffer_id_t scratch_memory_buffer = {}; rocprofiler_buffer_id_t corr_id_retire_buffer = {}; rocprofiler_buffer_id_t rccl_api_buffered_buffer = {}; +rocprofiler_buffer_id_t ompt_buffered_buffer = {}; auto contexts = std::unordered_map{ {"RUNTIME_INIT_CALLBACK", &runtime_init_callback_ctx}, @@ -1026,6 +1067,7 @@ auto contexts = std::unordered_map{ {"MEMORY_COPY_CALLBACK", &memory_copy_callback_ctx}, {"MEMORY_ALLOCATION_CALLBACK", &memory_allocation_callback_ctx}, {"RCCL_API_CALLBACK", &rccl_api_callback_ctx}, + {"OMPT_CALLBACK", &ompt_callback_ctx}, {"RUNTIME_INIT_BUFFERED", &runtime_init_buffered_ctx}, {"HSA_API_BUFFERED", &hsa_api_buffered_ctx}, {"HIP_API_BUFFERED", &hip_api_buffered_ctx}, @@ -1038,9 +1080,10 @@ auto contexts = std::unordered_map{ {"SCRATCH_MEMORY", &scratch_memory_ctx}, {"CORRELATION_ID_RETIREMENT", &corr_id_retire_ctx}, {"RCCL_API_BUFFERED", &rccl_api_buffered_ctx}, + {"OMPT_BUFFERED", &ompt_buffered_ctx}, }; -auto buffers = std::array{&runtime_init_buffered_buffer, +auto buffers = std::array{&runtime_init_buffered_buffer, &hsa_api_buffered_buffer, &hip_api_buffered_buffer, &marker_api_buffered_buffer, @@ -1051,7 +1094,8 @@ auto buffers = std::array{&runtime_init_buffered_b &page_migration_buffer, &counter_collection_buffer, &corr_id_retire_buffer, - &rccl_api_buffered_buffer}; + &rccl_api_buffered_buffer, + &ompt_buffered_buffer}; auto agents = std::vector{}; auto agents_map = std::unordered_map{}; @@ -1217,6 +1261,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr), "rccl api callback tracing service configure"); + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(ompt_callback_ctx, + ROCPROFILER_CALLBACK_TRACING_OMPT, + nullptr, + 0, + tool_tracing_callback, + nullptr), + "ompt callback tracing service configure"); + constexpr auto buffer_size = 8192; constexpr auto watermark = 7936; @@ -1328,6 +1381,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &rccl_api_buffered_buffer), "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(ompt_buffered_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &ompt_buffered_buffer), + "buffer creation"); + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( runtime_init_buffered_ctx, ROCPROFILER_BUFFER_TRACING_RUNTIME_INITIALIZATION, @@ -1443,6 +1505,11 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) rccl_api_buffered_buffer), "buffer tracing service for rccl api configure"); + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service( + ompt_buffered_ctx, ROCPROFILER_BUFFER_TRACING_OMPT, nullptr, 0, ompt_buffered_buffer), + "buffer tracing service for ompt configure"); + ROCPROFILER_CALL( rocprofiler_configure_buffered_dispatch_counting_service( counter_collection_ctx, counter_collection_buffer, dispatch_callback, nullptr), @@ -1592,6 +1659,7 @@ tool_fini(void* tool_data) << ", memory_copy_callback_records=" << memory_copy_cb_records.size() << ", memory_allocation_callback_records=" << memory_allocation_cb_records.size() << ", rccl_api_callback_records=" << rccl_api_cb_records.size() + << ", ompt_callback_records=" << ompt_cb_records.size() << ", kernel_dispatch_bf_records=" << kernel_dispatch_bf_records.size() << ", memory_copy_bf_records=" << memory_copy_bf_records.size() << ", memory_allocation_bf_records=" << memory_allocation_bf_records.size() @@ -1603,6 +1671,7 @@ tool_fini(void* tool_data) << ", marker_api_bf_records=" << marker_api_bf_records.size() << ", corr_id_retire_records=" << corr_id_retire_records.size() << ", rccl_api_bf_records=" << rccl_api_bf_records.size() + << ", ompt_bf_records=" << ompt_bf_records.size() << ", counter_collection_value_records=" << counter_collection_bf_records.size() << "...\n" << std::flush; @@ -1694,6 +1763,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("hip_api_traces", hip_api_cb_records)); json_ar(cereal::make_nvp("marker_api_traces", marker_api_cb_records)); json_ar(cereal::make_nvp("rccl_api_traces", rccl_api_cb_records)); + json_ar(cereal::make_nvp("ompt_traces", ompt_cb_records)); json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_cb_records)); json_ar(cereal::make_nvp("kernel_dispatch", kernel_dispatch_cb_records)); json_ar(cereal::make_nvp("memory_copies", memory_copy_cb_records)); @@ -1721,6 +1791,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("hip_api_traces", hip_api_bf_records)); json_ar(cereal::make_nvp("marker_api_traces", marker_api_bf_records)); json_ar(cereal::make_nvp("rccl_api_traces", rccl_api_bf_records)); + json_ar(cereal::make_nvp("ompt_traces", ompt_bf_records)); json_ar(cereal::make_nvp("retired_correlation_ids", corr_id_retire_records)); json_ar(cereal::make_nvp("counter_collection", counter_collection_bf_records)); } catch(std::exception& e) @@ -1791,6 +1862,8 @@ write_perfetto() tids.emplace(itr.thread_id); for(auto itr : rccl_api_bf_records) tids.emplace(itr.thread_id); + for(auto itr : ompt_bf_records) + tids.emplace(itr.thread_id); for(auto itr : memory_copy_bf_records) { @@ -2014,6 +2087,45 @@ write_perfetto() itr.end_timestamp); } + for(auto itr : ompt_bf_records) + { + auto name = buffer_names.at(itr.kind, itr.operation); + auto& track = thread_tracks.at(itr.thread_id); + + auto _args = callback_arg_array_t{}; + auto ritr = std::find_if( + ompt_cb_records.begin(), ompt_cb_records.end(), [&itr](const auto& citr) { + return (citr.record.correlation_id.internal == itr.correlation_id.internal && + !citr.args.empty()); + }); + if(ritr != ompt_cb_records.end()) _args = ritr->args; + + TRACE_EVENT_BEGIN(sdk::perfetto_category::name, + ::perfetto::StaticString(name.data()), + track, + itr.start_timestamp, + ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), + "begin_ns", + itr.start_timestamp, + "tid", + itr.thread_id, + "kind", + itr.kind, + "operation", + itr.operation, + "corr_id", + itr.correlation_id.internal, + [&](::perfetto::EventContext ctx) { + for(const auto& aitr : _args) + sdk::add_perfetto_annotation(ctx, aitr.first, aitr.second); + }); + TRACE_EVENT_END(sdk::perfetto_category::name, + track, + itr.end_timestamp, + "end_ns", + itr.end_timestamp); + } + for(auto itr : memory_copy_bf_records) { auto name = buffer_names.at(itr.kind, itr.operation);