From f17ff12a662084405450c656bf868a6451562f38 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 24 Jan 2022 20:49:17 -0600 Subject: [PATCH] Sampling support + testing + omnitrace namespace (#19) * omnitrace namespace * Kokkos + Lulesh example/tests * Sampling support + more - OMNITRACE_BUILD_TESTING option - sampling support - pthread_gotcha - fixes to labels for mpi_gotcha, fork_gotcha, omnitrace_component - tasking::block_signals, tasking::unblock_signals - instrumentation mode option in omnitrace exe - argument option groups in omnitrace exe - categories in omnitrace settings - remove TIMEMORY_ prefixed options * Release workflow updates * Updated settings printing * Fixed defaults in README * Tweak setting defaults in README * CMake fixes * cmake-format * clang-format * LULESH_USE_MPI OFF * LULESH_USE_MPI fix * timemory add_secondary fix * timemory ambiguous internal namespace fix * Update timemory submodule * Handle output path/prefix in omnitrace - updated timemory - updated test environment * sampling + papi fix * Fix to sampling without PAPI * Fix for using too many processors in CI * formatting * Updated CI - minor cmake tweaks - updated timemory submodule * Updated CI * Updated CI * CI + timemory updates - data race fixes * CI updates + debug for sampling * Sampling updates - moved tasking::{block,unblock}_signals to sampling namespace - improvements to sampling w.r.t. 
thread-locality * Minimum OMNITRACE_THREAD_COUNT of 128 * Handle multiple dims in sampler data * Configure libunwind support for timemory * Improved safeguards for sampling - updated CI - lulesh runtime-instrument test tweak * formatting * CI updates + sampler updates + misc - fixed stack-buffer-overflow in omnitrace (get_*file_line_info) - test labels - steady_clock instead of system_clock in sampler - update dyninst submodule with upgradePlaceholder fix - disable OMNITRACE_BUILD_TESTING by default * Updated timemory submodule - hidden visibility for timemory - storage finalizers do not capture this * Update timemory submodule - component visibility updates * Reworked header includes - use <...> for timemory headers - always include * Rename some config options * Update PTL submodule * Update kokkos submodule * Updated sampling * Updated CI * Reworked instrumentation exe - lowered min-address-range threshold to 256 - extended whole function exclude * CI fix + timemory submodule update - TIMEMORY_VISIBLE on component base - RelWithDebugInfo -> RelWithDebInfo - Info output for parallel-overhead * Sampling flags + transpose update + CI update - disable critical trace for parallel-overhead in CI - SA_RESTART only in sampler - reworked transpose example to use fewer threads * CI update - removed ubuntu-focal-external-debug - reduced data artifacts upload * CI timeouts - updated timemory submodule - minor tweaks to omnitrace exe logging * LICENSE updates (partial) * CI Test stage timeout extension * Docker and Packaging updates * Miscellaneous fixes/tweaks - gpu.hpp / gpu.cpp - disable roctracer component if no devices - re-enable InstrStackFrames by default - disable sampling by default - pthread_gotcha::m_enable_sampling is false by default - timemory submodule update w/ sampler and pop(tid) updates - fix minor bug in sampler logic - CMake: OMNITRACE_USE_HIP option - roctracer + timemory fix * Replaced OMNITRACE_USE_ROCTRACER with OMNITRACE_USE_HIP where appropriate * 
cmake format * Sampler deadlock fixes * Removed debug messages from sampler * Fix for MPI detection + test tweaks + misc * Sampler deadlock fixes + misc - removed papi_tot_ins - pthread_gotcha blocks signals globally until sampler is setup - metadata specialization for sampling components - OMNITRACE_INSTRUMENTATION_MODE -> OMNITRACE_MODE - default sampling delay increased to 0.05 from 1.0e-6 - removed {block,unblock}_signals from critical_trace and ptl - no longer necessary to use - sampling delay minimum is 1.0e-3 - OMNITRACE_BUILD_HIDDEN_VISIBILITY * omnitrace-avail + libunwind update + restructure - restructured omnitrace components - build custom omnitrace-avail executable - updated libunwind to avoid malloc in get_unw_backtrace * Fix remaining reorganization issues - removed some duplicate code - fixed some trait specializations after implicit instatiation - formatting * ensure_storage fix + avail improvements - fix ensure_storage when component not avail - suppress irrelevant info in omnitrace-avail * Delay settings initialization - slight tweak to tests w/ MPI * Disable OpenMPI testing w/ ubuntu-bionic - MPI testing is hanging bc of network interface issue on system: > [[20462,1],0]: A high-performance Open MPI point-to-point messaging module > was unable to find any relevant network interfaces: > Module: OpenFabrics (openib) > Host: fv-az19-371 > Another transport will be used instead, although this may result in > lower performance. > NOTE: You can disable this warning by setting the MCA parameter > btl_base_warn_component_unused to 0. 
[ROCm/rocprofiler-systems commit: 778af2a760178b9a8c06d02cf33672ed51518b80] --- .../rocprofiler-systems/.cmake-format.yaml | 11 + .../.github/workflows/linux-ci.yml | 232 +- projects/rocprofiler-systems/.gitmodules | 3 + projects/rocprofiler-systems/CMakeLists.txt | 153 +- projects/rocprofiler-systems/LICENSE | 28 +- projects/rocprofiler-systems/README.md | 62 +- .../cmake/Formatting.cmake | 5 + .../rocprofiler-systems/cmake/Packages.cmake | 61 +- .../rocprofiler-systems/docker/Dockerfile | 5 +- .../docker/build-docker-release.sh | 22 + .../docker/build-docker.sh | 8 + .../examples/CMakeLists.txt | 4 + .../examples/lulesh/CMakeLists.txt | 60 + .../lulesh/cmake/Modules/Utilities.cmake | 315 +++ .../examples/lulesh/external/CMakeLists.txt | 28 + .../examples/lulesh/external/kokkos | 1 + .../examples/lulesh/includes/Timer.hxx | 127 + .../examples/lulesh/includes/cycle.h | 545 ++++ .../examples/lulesh/lulesh-comm.cc | 2073 +++++++++++++++ .../examples/lulesh/lulesh-init.cc | 886 +++++++ .../examples/lulesh/lulesh-util.cc | 273 ++ .../examples/lulesh/lulesh-viz.cc | 422 +++ .../examples/lulesh/lulesh.cc | 2311 +++++++++++++++++ .../examples/lulesh/lulesh.h | 836 ++++++ .../examples/lulesh/lulesh_tuple.h | 651 +++++ .../examples/parallel-overhead/CMakeLists.txt | 2 +- .../parallel-overhead/parallel-overhead.cpp | 6 +- .../examples/transpose/CMakeLists.txt | 2 +- .../examples/transpose/transpose.cpp | 85 +- projects/rocprofiler-systems/external/PTL | 2 +- projects/rocprofiler-systems/external/dyninst | 2 +- .../rocprofiler-systems/external/timemory | 2 +- .../rocprofiler-systems/include/avail.hpp | 337 +++ .../rocprofiler-systems/include/library.hpp | 13 +- .../include/library/api.hpp | 2 + .../include/library/common.hpp | 12 +- .../include/library/components/backtrace.hpp | 125 + .../library/{ => components}/fork_gotcha.hpp | 16 +- .../include/library/components/fwd.hpp | 121 + .../library/{ => components}/mpi_gotcha.hpp | 18 +- .../omnitrace.hpp} | 21 +- 
.../library/components/pthread_gotcha.hpp | 75 + .../library/{ => components}/roctracer.hpp | 30 +- .../{ => components}/roctracer_callbacks.hpp | 18 +- .../include/library/config.hpp | 67 +- .../include/library/critical_trace.hpp | 7 +- .../include/library/debug.hpp | 13 +- .../include/library/defines.hpp.in | 12 +- .../include/library/dynamic_library.hpp | 7 +- .../include/library/gpu.hpp | 32 + .../include/library/perfetto.hpp | 5 + .../include/library/ptl.hpp | 8 +- .../include/library/sampling.hpp | 76 + .../include/library/state.hpp | 5 + .../include/library/thread_data.hpp | 10 +- .../include/library/timemory.hpp | 29 +- .../rocprofiler-systems/include/omnitrace.hpp | 130 +- .../rocprofiler-systems/scripts/.LICENSE.hpp | 28 +- .../scripts/build-release.sh | 69 +- projects/rocprofiler-systems/src/avail.cpp | 1389 ++++++++++ projects/rocprofiler-systems/src/library.cpp | 291 ++- .../src/library/components/backtrace.cpp | 590 +++++ .../library/{ => components}/fork_gotcha.cpp | 25 +- .../library/{ => components}/mpi_gotcha.cpp | 100 +- .../omnitrace.cpp} | 16 +- .../src/library/components/pthread_gotcha.cpp | 151 ++ .../library/{ => components}/roctracer.cpp | 6 +- .../{ => components}/roctracer_callbacks.cpp | 21 +- .../src/library/config.cpp | 267 +- .../src/library/critical_trace.cpp | 25 +- .../rocprofiler-systems/src/library/gpu.cpp | 51 + .../rocprofiler-systems/src/library/ptl.cpp | 33 +- .../src/library/sampling.cpp | 219 ++ .../src/library/thread_data.cpp | 3 + .../src/library/timemory.cpp | 2 + .../rocprofiler-systems/src/omnitrace.cpp | 885 +++---- .../src/omnitrace/details.cpp | 519 +++- .../rocprofiler-systems/tests/CMakeLists.txt | 265 +- 78 files changed, 14296 insertions(+), 1071 deletions(-) create mode 100755 projects/rocprofiler-systems/docker/build-docker-release.sh create mode 100755 projects/rocprofiler-systems/docker/build-docker.sh create mode 100644 projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt create mode 100644 
projects/rocprofiler-systems/examples/lulesh/cmake/Modules/Utilities.cmake create mode 100644 projects/rocprofiler-systems/examples/lulesh/external/CMakeLists.txt create mode 160000 projects/rocprofiler-systems/examples/lulesh/external/kokkos create mode 100644 projects/rocprofiler-systems/examples/lulesh/includes/Timer.hxx create mode 100644 projects/rocprofiler-systems/examples/lulesh/includes/cycle.h create mode 100644 projects/rocprofiler-systems/examples/lulesh/lulesh-comm.cc create mode 100644 projects/rocprofiler-systems/examples/lulesh/lulesh-init.cc create mode 100644 projects/rocprofiler-systems/examples/lulesh/lulesh-util.cc create mode 100644 projects/rocprofiler-systems/examples/lulesh/lulesh-viz.cc create mode 100644 projects/rocprofiler-systems/examples/lulesh/lulesh.cc create mode 100644 projects/rocprofiler-systems/examples/lulesh/lulesh.h create mode 100644 projects/rocprofiler-systems/examples/lulesh/lulesh_tuple.h create mode 100644 projects/rocprofiler-systems/include/avail.hpp create mode 100644 projects/rocprofiler-systems/include/library/components/backtrace.hpp rename projects/rocprofiler-systems/include/library/{ => components}/fork_gotcha.hpp (83%) create mode 100644 projects/rocprofiler-systems/include/library/components/fwd.hpp rename projects/rocprofiler-systems/include/library/{ => components}/mpi_gotcha.hpp (88%) rename projects/rocprofiler-systems/include/library/{omnitrace_component.hpp => components/omnitrace.hpp} (76%) create mode 100644 projects/rocprofiler-systems/include/library/components/pthread_gotcha.hpp rename projects/rocprofiler-systems/include/library/{ => components}/roctracer.hpp (84%) rename projects/rocprofiler-systems/include/library/{ => components}/roctracer_callbacks.hpp (87%) create mode 100644 projects/rocprofiler-systems/include/library/gpu.hpp create mode 100644 projects/rocprofiler-systems/include/library/sampling.hpp create mode 100644 projects/rocprofiler-systems/src/avail.cpp create mode 100644 
projects/rocprofiler-systems/src/library/components/backtrace.cpp rename projects/rocprofiler-systems/src/library/{ => components}/fork_gotcha.cpp (75%) rename projects/rocprofiler-systems/src/library/{ => components}/mpi_gotcha.cpp (58%) rename projects/rocprofiler-systems/src/library/{omnitrace_component.cpp => components/omnitrace.cpp} (86%) create mode 100644 projects/rocprofiler-systems/src/library/components/pthread_gotcha.cpp rename projects/rocprofiler-systems/src/library/{ => components}/roctracer.cpp (98%) rename projects/rocprofiler-systems/src/library/{ => components}/roctracer_callbacks.cpp (97%) create mode 100644 projects/rocprofiler-systems/src/library/gpu.cpp create mode 100644 projects/rocprofiler-systems/src/library/sampling.cpp diff --git a/projects/rocprofiler-systems/.cmake-format.yaml b/projects/rocprofiler-systems/.cmake-format.yaml index 9aa254aec7..8b5d79d794 100644 --- a/projects/rocprofiler-systems/.cmake-format.yaml +++ b/projects/rocprofiler-systems/.cmake-format.yaml @@ -18,6 +18,17 @@ parse: kwargs: VARIABLES: '*' CONDITION: '*' + omnitrace_add_test: + kwargs: + NAME: '*' + TARGET: '*' + MPI: '*' + NUM_PROCS: '*' + REWRITE_ARGS: '*' + RUNTIME_ARGS: '*' + RUN_ARGS: '*' + ENVIRONMENT: '*' + LABELS: '*' override_spec: {} vartags: [] proptags: [] diff --git a/projects/rocprofiler-systems/.github/workflows/linux-ci.yml b/projects/rocprofiler-systems/.github/workflows/linux-ci.yml index c7f89e2232..6e56c1a2c2 100644 --- a/projects/rocprofiler-systems/.github/workflows/linux-ci.yml +++ b/projects/rocprofiler-systems/.github/workflows/linux-ci.yml @@ -24,9 +24,9 @@ jobs: - name: Install Packages run: sudo apt-get update && - sudo apt-get install -y build-essential python3-pip libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev ${{ matrix.compiler }} ${{ matrix.mpi }} && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev ${{ 
matrix.compiler }} ${{ matrix.mpi }} && python3 -m pip install --upgrade pip && - python3 -m pip install 'cmake==3.15.3' + python3 -m pip install 'cmake==3.16.3' - name: Configure Env run: @@ -44,15 +44,17 @@ jobs: -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} -DCMAKE_INSTALL_PREFIX=/opt/omnitrace - -DOMNITRACE_USE_MPI=${USE_MPI} - -DOMNITRACE_USE_ROCTRACER=OFF + -DOMNITRACE_BUILD_TESTING=ON -DOMNITRACE_BUILD_DYNINST=ON + -DOMNITRACE_USE_MPI=${USE_MPI} + -DOMNITRACE_USE_HIP=OFF -DDYNINST_BUILD_ELFUTILS=ON -DDYNINST_BUILD_LIBIBERTY=ON -DDYNINST_BUILD_SHARED_LIBS=ON -DDYNINST_BUILD_STATIC_LIBS=OFF - name: Build + timeout-minutes: 45 run: cmake --build ${{ github.workspace }}/build --target all --parallel 2 -- VERBOSE=1 @@ -61,31 +63,40 @@ jobs: cmake --build ${{ github.workspace }}/build --target install --parallel 2 - name: Test + timeout-minutes: 30 working-directory: ${{ github.workspace }}/build run: ctest -V --output-log ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-focal.log - name: Test Install + timeout-minutes: 10 run: omnitrace --help && - omnitrace -- sleep 1 && - omnitrace -o sleep.inst -- sleep && - ./sleep.inst 1 && - rm ./sleep.inst + omnitrace -e -v 1 -o ls.inst -- ls && + ./ls.inst && + rm ./ls.inst && + omnitrace -e -v 1 -- ls - - name: Artifacts + - name: CTest Artifacts uses: actions/upload-artifact@v2 with: name: ctest-log path: | - ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-focal.log + ${{ github.workspace }}/build/*.log + + - name: Data Artifacts + uses: actions/upload-artifact@v2 + with: + name: data-files + path: | + ${{ github.workspace }}/build/omnitrace-tests-output/*.txt ubuntu-bionic: runs-on: ubuntu-18.04 strategy: matrix: compiler: ['g++-7', 'g++-8'] - mpi: [ '', 'libmpich-dev mpich', 'libopenmpi-dev openmpi-bin libfabric-dev' ] + mpi: [ '', 'libmpich-dev mpich' ] steps: - uses: actions/checkout@v2 @@ -93,9 +104,9 @@ jobs: - name: Install Packages run: sudo apt-get update && - 
sudo apt-get install -y build-essential python3-pip ${{ matrix.compiler }} ${{ matrix.mpi }} && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip ${{ matrix.compiler }} ${{ matrix.mpi }} && python3 -m pip install --upgrade pip && - python3 -m pip install 'cmake==3.15.3' + python3 -m pip install 'cmake==3.16.3' - name: Configure Env run: @@ -113,15 +124,17 @@ jobs: -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} -DCMAKE_INSTALL_PREFIX=/opt/omnitrace - -DOMNITRACE_USE_MPI=${USE_MPI} - -DOMNITRACE_USE_ROCTRACER=OFF + -DOMNITRACE_BUILD_TESTING=ON -DOMNITRACE_BUILD_DYNINST=ON + -DOMNITRACE_USE_MPI=${USE_MPI} + -DOMNITRACE_USE_HIP=OFF -DDYNINST_BUILD_TBB=ON -DDYNINST_BUILD_BOOST=ON -DDYNINST_BUILD_ELFUTILS=ON -DDYNINST_BUILD_LIBIBERTY=ON - name: Build + timeout-minutes: 45 run: cmake --build ${{ github.workspace }}/build --target all --parallel 2 -- VERBOSE=1 @@ -130,24 +143,33 @@ jobs: cmake --build ${{ github.workspace }}/build --target install --parallel 2 - name: Test + timeout-minutes: 30 working-directory: ${{ github.workspace }}/build run: ctest -V --output-log ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-bionic.log - name: Test Install + timeout-minutes: 10 run: omnitrace --help && - omnitrace -- sleep 1 && - omnitrace -o sleep.inst -- sleep && - ./sleep.inst 1 && - rm ./sleep.inst + omnitrace -e -v 1 -o ls.inst -- ls && + ./ls.inst && + rm ./ls.inst && + omnitrace -e -v 1 -- ls - - name: Artifacts + - name: CTest Artifacts uses: actions/upload-artifact@v2 with: name: ctest-log path: | - ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-bionic.log + ${{ github.workspace }}/build/*.log + + - name: Data Artifacts + uses: actions/upload-artifact@v2 + with: + name: data-files + path: | + ${{ github.workspace }}/build/omnitrace-tests-output/*.txt ubuntu-focal-external: runs-on: ubuntu-20.04 @@ -161,15 +183,15 @@ jobs: - name: Install Packages run: sudo apt-get update && - sudo apt-get install -y 
build-essential python3-pip libboost-{atomic,system,thread,date-time,filesystem,timer}-dev libtbb-dev libiberty-dev ${{ matrix.compiler }} && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libboost-{atomic,system,thread,date-time,filesystem,timer}-dev libtbb-dev libiberty-dev ${{ matrix.compiler }} && sudo python3 -m pip install --upgrade pip && - python3 -m pip install 'cmake==3.15.3' + python3 -m pip install 'cmake==3.16.3' - name: Configure Env run: echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV && echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV && - echo "CMAKE_PREFIX_PATH=/opt/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && + echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH && echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV @@ -193,7 +215,7 @@ jobs: cmake -B build -DCMAKE_C_COMPILER=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g') -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} - -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} + -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/opt/dyninst && cmake --build build --target all --parallel 2 && cmake --build build --target install --parallel 2 && @@ -205,12 +227,14 @@ jobs: cmake -B ${{ github.workspace }}/build -DCMAKE_C_COMPILER=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g') -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} - -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} + -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/opt/omnitrace + -DOMNITRACE_BUILD_TESTING=ON -DOMNITRACE_USE_MPI=OFF - -DOMNITRACE_USE_ROCTRACER=OFF + -DOMNITRACE_USE_HIP=OFF - name: Build + timeout-minutes: 45 run: cmake --build ${{ github.workspace }}/build --target all --parallel 2 -- VERBOSE=1 @@ -219,6 +243,7 @@ jobs: cmake --build ${{ github.workspace }}/build --target install 
--parallel 2 - name: Test + timeout-minutes: 30 working-directory: ${{ github.workspace }}/build run: ldd ./omnitrace && @@ -226,20 +251,28 @@ jobs: ctest -V --output-log ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-focal-external.log - name: Test Install + timeout-minutes: 10 run: ldd $(which omnitrace) && omnitrace --help && - omnitrace -- sleep 1 && - omnitrace -o sleep.inst -- sleep && - ./sleep.inst 1 && - rm ./sleep.inst + omnitrace -e -v 1 -o ls.inst -- ls && + ./ls.inst && + rm ./ls.inst && + omnitrace -e -v 1 -- ls - - name: Artifacts + - name: CTest Artifacts uses: actions/upload-artifact@v2 with: name: ctest-log path: | - ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-focal-external.log + ${{ github.workspace }}/build/*.log + + - name: Data Artifacts + uses: actions/upload-artifact@v2 + with: + name: data-files + path: | + ${{ github.workspace }}/build/omnitrace-tests-output/*.txt ubuntu-focal-dyninst-package: runs-on: ubuntu-20.04 @@ -253,15 +286,15 @@ jobs: - name: Install Packages run: sudo apt-get update && - sudo apt-get install -y build-essential python3-pip ${{ matrix.compiler }} && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip ${{ matrix.compiler }} && sudo python3 -m pip install --upgrade pip && - python3 -m pip install 'cmake==3.15.3' + python3 -m pip install 'cmake==3.16.3' - name: Configure Env run: echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV && echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV && - echo "CMAKE_PREFIX_PATH=/opt/opt/dyninst:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && + echo "CMAKE_PREFIX_PATH=/opt/dyninst:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && echo "/opt/omnitrace/bin:/opt/dyninst/bin:${HOME}/.local/bin" >> $GITHUB_PATH && echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV @@ -292,10 +325,12 @@ jobs: -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} 
-DCMAKE_INSTALL_PREFIX=/opt/omnitrace + -DOMNITRACE_BUILD_TESTING=ON -DOMNITRACE_USE_MPI=OFF - -DOMNITRACE_USE_ROCTRACER=OFF + -DOMNITRACE_USE_HIP=OFF - name: Build + timeout-minutes: 45 run: cmake --build ${{ github.workspace }}/build --target all --parallel 2 -- VERBOSE=1 @@ -304,6 +339,7 @@ jobs: cmake --build ${{ github.workspace }}/build --target install --parallel 2 - name: Test + timeout-minutes: 30 working-directory: ${{ github.workspace }}/build run: ldd ./omnitrace && @@ -311,17 +347,127 @@ jobs: ctest -V --output-log ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-focal-dyninst-package.log - name: Test Install + timeout-minutes: 10 run: ldd $(which omnitrace) && omnitrace --help && - omnitrace -- sleep 1 && - omnitrace -o sleep.inst -- sleep && - ./sleep.inst 1 && - rm ./sleep.inst + omnitrace -e -v 1 -o ls.inst -- ls && + ./ls.inst && + rm ./ls.inst && + omnitrace -e -v 1 -- ls - - name: Artifacts + - name: CTest Artifacts uses: actions/upload-artifact@v2 with: name: ctest-log path: | - ${{ github.workspace }}/build/omnitrace-ctest-ubuntu-focal-dyninst-package.log + ${{ github.workspace }}/build/*.log + + - name: Data Artifacts + uses: actions/upload-artifact@v2 + with: + name: data-files + path: | + ${{ github.workspace }}/build/omnitrace-tests-output/*.txt + + ubuntu-focal-external-rocm: + runs-on: ubuntu-20.04 + strategy: + matrix: + compiler: ['g++'] + rocm_version: ['4.3', '4.3.1', '4.5'] + mpi: [ 'libmpich-dev mpich', 'libopenmpi-dev openmpi-bin libfabric-dev' ] + + steps: + - uses: actions/checkout@v2 + + - name: Install Packages + run: + echo '1' | sudo tee /proc/sys/kernel/perf_event_paranoid && + sudo apt-get update && + sudo apt-get install -y software-properties-common wget gnupg2 && + sudo wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add - && + echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | sudo tee /etc/apt/sources.list.d/rocm.list && + sudo apt-get update && 
+ sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libboost-{atomic,system,thread,date-time,filesystem,timer}-dev libtbb-dev libiberty-dev ${{ matrix.compiler }} libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${{ matrix.mpi }} libpapi-dev && + sudo python3 -m pip install --upgrade pip && + python3 -m pip install 'cmake==3.16.3' + + - name: Configure Env + run: + echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV && + echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV && + echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && + echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH && + echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + + - name: Install ElfUtils + run: + pushd external && + wget https://sourceware.org/elfutils/ftp/${ELFUTILS_DOWNLOAD_VERSION}/elfutils-${ELFUTILS_DOWNLOAD_VERSION}.tar.bz2 && + tar xjf elfutils-${ELFUTILS_DOWNLOAD_VERSION}.tar.bz2 && + pushd elfutils-${ELFUTILS_DOWNLOAD_VERSION} && + CFLAGS="-O3" ./configure --enable-install-elfh --prefix=/opt/elfutils --disable-libdebuginfod --disable-debuginfod && + make -j2 && + make install -j2 && + popd && + rm -rf elfutils* + + - name: Install Dyninst + run: + cmake --version && + git submodule update --init external/dyninst && + cd external/dyninst && + cmake -B build + -DCMAKE_C_COMPILER=${CC} + -DCMAKE_CXX_COMPILER=${CXX} + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=/opt/dyninst && + cmake --build build --target all --parallel 2 && + cmake --build build --target install --parallel 2 && + rm -rf build + + - name: Configure CMake + run: + cmake --version && + cmake -B ${{ github.workspace }}/build + -DCMAKE_C_COMPILER=${CC} + -DCMAKE_CXX_COMPILER=${CXX} + -DCMAKE_BUILD_TYPE=RelWithDebInfo + -DCMAKE_INSTALL_PREFIX=/opt/omnitrace + 
-DOMNITRACE_BUILD_TESTING=OFF + -DOMNITRACE_BUILD_DEVELOPER=ON + -DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF + -DOMNITRACE_BUILD_LTO=OFF + -DOMNITRACE_USE_MPI=OFF + -DOMNITRACE_USE_MPI_HEADERS=ON + -DOMNITRACE_USE_HIP=ON + -DOMNITRACE_MAX_THREADS=256 + -DOMNITRACE_USE_SANITIZER=OFF + -DTIMEMORY_USE_PAPI=ON + + - name: Build + timeout-minutes: 45 + run: + cmake --build ${{ github.workspace }}/build --target all --parallel 2 -- VERBOSE=1 + + - name: Install + run: + cmake --build ${{ github.workspace }}/build --target install --parallel 2 + + - name: Test + timeout-minutes: 30 + working-directory: ${{ github.workspace }}/build + run: + ldd ./omnitrace && + ./omnitrace --help + + - name: Test Install + timeout-minutes: 10 + run: + ldd $(which omnitrace) && + omnitrace --help && + omnitrace -e -v 1 -o ls.inst -- ls && + ./ls.inst && + rm ./ls.inst && + omnitrace -e -v 1 -- ls diff --git a/projects/rocprofiler-systems/.gitmodules b/projects/rocprofiler-systems/.gitmodules index 6a4abb2132..45cee55b0d 100644 --- a/projects/rocprofiler-systems/.gitmodules +++ b/projects/rocprofiler-systems/.gitmodules @@ -13,3 +13,6 @@ [submodule "external/PTL"] path = external/PTL url = https://github.com/jrmadsen/PTL.git +[submodule "external/kokkos"] + path = examples/lulesh/external/kokkos + url = https://github.com/kokkos/kokkos.git diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index d5be7366e9..0020562b41 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -58,21 +58,37 @@ set(CMAKE_CXX_STANDARD 17 CACHE STRING "CXX language standard") omnitrace_add_feature(CMAKE_CXX_STANDARD "CXX language standard") +omnitrace_add_feature(CMAKE_BUILD_TYPE "Build optimization level") omnitrace_add_option(CMAKE_CXX_STANDARD_REQUIRED "Require C++ language standard" ON) omnitrace_add_option(CMAKE_CXX_EXTENSIONS "Compiler specific language extensions" OFF) 
omnitrace_add_option(CMAKE_INSTALL_RPATH_USE_LINK_PATH "Enable rpath to linked libraries" ON) + omnitrace_add_option(OMNITRACE_USE_CLANG_TIDY "Enable clang-tidy" OFF) omnitrace_add_option(OMNITRACE_USE_MPI "Enable MPI support" OFF) -omnitrace_add_option(OMNITRACE_CUSTOM_DATA_SOURCE "Enable custom data source" OFF) -omnitrace_add_option(OMNITRACE_USE_ROCTRACER "Enable roctracer support" ON) -omnitrace_add_option(OMNITRACE_BUILD_DYNINST "Build dyninst from submodule" OFF) +omnitrace_add_option(OMNITRACE_USE_HIP "Enable HIP support" ON) +omnitrace_add_option(OMNITRACE_USE_ROCTRACER "Enable roctracer support" + ${OMNITRACE_USE_HIP}) omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" OFF) +omnitrace_add_option(OMNITRACE_BUILD_DYNINST "Build dyninst from submodule" OFF) +omnitrace_add_option(OMNITRACE_BUILD_TESTING "Enable building the testing suite" OFF) +omnitrace_add_option(OMNITRACE_CUSTOM_DATA_SOURCE "Enable custom data source" OFF) +omnitrace_add_option(OMNITRACE_BUILD_HIDDEN_VISIBILITY + "Build with hidden visibility (disable for Debug builds)" ON) + +if(NOT OMNITRACE_USE_HIP) + set(OMNITRACE_USE_ROCTRACER + OFF + CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE) +endif() include(ProcessorCount) processorcount(OMNITRACE_PROCESSOR_COUNT) -math(EXPR OMNITRACE_THREAD_COUNT "8 * ${OMNITRACE_PROCESSOR_COUNT}") +math(EXPR OMNITRACE_THREAD_COUNT "16 * ${OMNITRACE_PROCESSOR_COUNT}") +if(OMNITRACE_THREAD_COUNT LESS 128) + set(OMNITRACE_THREAD_COUNT 128) +endif() set(OMNITRACE_MAX_THREADS "${OMNITRACE_THREAD_COUNT}" CACHE @@ -81,24 +97,25 @@ set(OMNITRACE_MAX_THREADS ) omnitrace_add_feature( OMNITRACE_MAX_THREADS - "Maximum number of total threads supported in the host application (default: 8 * nproc)" + "Maximum number of total threads supported in the host application (default: max of 128 or 16 * nproc)" ) -# ensure synced -set(TIMEMORY_USE_MPI - ${OMNITRACE_USE_MPI} - CACHE BOOL "Enable MPI support" FORCE) - # 
default visibility settings -set(CMAKE_C_VISIBILITY_PRESET "default") -set(CMAKE_CXX_VISIBILITY_PRESET "default") -set(CMAKE_VISIBILITY_INLINES_HIDDEN OFF) +set(CMAKE_C_VISIBILITY_PRESET + "default" + CACHE STRING "Visibility preset for non-inline C functions") +set(CMAKE_CXX_VISIBILITY_PRESET + "default" + CACHE STRING "Visibility preset for non-inline C++ functions/objects") +set(CMAKE_VISIBILITY_INLINES_HIDDEN + OFF + CACHE BOOL "Visibility preset for inline functions") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) include(Formatting) # format target include(Packages) # finds third-party libraries -if(OMNITRACE_USE_ROCTRACER) +if(OMNITRACE_USE_HIP OR OMNITRACE_USE_ROCTRACER) find_package(HIP QUIET) if(HIP_VERSION_MAJOR GREATER_EQUAL 4 AND HIP_VERSION_MINOR GREATER 3) set(roctracer_kfdwrapper_LIBRARY) @@ -116,9 +133,11 @@ configure_file(${PROJECT_SOURCE_DIR}/include/library/defines.hpp.in omnitrace_activate_clang_tidy() # custom visibility settings -set(CMAKE_C_VISIBILITY_PRESET "hidden") -set(CMAKE_CXX_VISIBILITY_PRESET "hidden") -set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) +if(OMNITRACE_BUILD_HIDDEN_VISIBILITY) + set(CMAKE_C_VISIBILITY_PRESET "hidden") + set(CMAKE_CXX_VISIBILITY_PRESET "hidden") + set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) +endif() if(OMNITRACE_BUILD_LTO) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON) @@ -134,13 +153,17 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/src/library.cpp ${CMAKE_CURRENT_LIST_DIR}/src/library/config.cpp ${CMAKE_CURRENT_LIST_DIR}/src/library/critical_trace.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/fork_gotcha.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/omnitrace_component.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/mpi_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/gpu.cpp ${CMAKE_CURRENT_LIST_DIR}/src/library/perfetto.cpp ${CMAKE_CURRENT_LIST_DIR}/src/library/ptl.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/sampling.cpp ${CMAKE_CURRENT_LIST_DIR}/src/library/thread_data.cpp ${CMAKE_CURRENT_LIST_DIR}/src/library/timemory.cpp + 
${CMAKE_CURRENT_LIST_DIR}/src/library/components/backtrace.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/fork_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/mpi_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/omnitrace.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/pthread_gotcha.cpp ${perfetto_DIR}/sdk/perfetto.cc) set(library_headers @@ -150,49 +173,53 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/include/library/common.hpp ${CMAKE_CURRENT_LIST_DIR}/include/library/critical_trace.hpp ${CMAKE_CURRENT_LIST_DIR}/include/library/debug.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/fork_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/omnitrace_component.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/mpi_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/gpu.hpp ${CMAKE_CURRENT_LIST_DIR}/include/library/perfetto.hpp ${CMAKE_CURRENT_LIST_DIR}/include/library/ptl.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/sampling.hpp ${CMAKE_CURRENT_LIST_DIR}/include/library/state.hpp ${CMAKE_CURRENT_LIST_DIR}/include/library/thread_data.hpp ${CMAKE_CURRENT_LIST_DIR}/include/library/timemory.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/fwd.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/backtrace.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/fork_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/mpi_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/omnitrace.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/pthread_gotcha.hpp ${perfetto_DIR}/sdk/perfetto.h) -if(NOT TIMEMORY_USE_PERFETTO) - -endif() - add_library(omnitrace-library SHARED ${library_sources} ${library_headers}) if(OMNITRACE_USE_ROCTRACER) target_sources( omnitrace-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include/library/roctracer.hpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/roctracer.cpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/roctracer_callbacks.hpp - 
${CMAKE_CURRENT_LIST_DIR}/src/library/roctracer_callbacks.cpp) + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/roctracer.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/roctracer_callbacks.cpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/roctracer.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/roctracer_callbacks.hpp) endif() target_include_directories(omnitrace-library SYSTEM PRIVATE ${perfetto_DIR}/sdk) target_compile_definitions( omnitrace-library - PRIVATE $,CUSTOM_DATA_SOURCE,>) + PRIVATE OMNITRACE_MAX_THREADS=${OMNITRACE_MAX_THREADS} + $,CUSTOM_DATA_SOURCE,>) target_link_libraries( omnitrace-library - PRIVATE omnitrace::omnitrace-headers - omnitrace::omnitrace-threading - omnitrace::omnitrace-compile-options - omnitrace::omnitrace-roctracer - omnitrace::omnitrace-mpi - omnitrace::omnitrace-ptl - $ - $ - $ - $,omnitrace::omnitrace-sanitizer,>) + PUBLIC $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $,omnitrace::omnitrace-sanitizer,>) if(OMNITRACE_DYNINST_API_RT) get_filename_component(OMNITRACE_DYNINST_API_RT_DIR "${OMNITRACE_DYNINST_API_RT}" @@ -200,14 +227,35 @@ if(OMNITRACE_DYNINST_API_RT) endif() set_target_properties( - omnitrace-library PROPERTIES OUTPUT_NAME omnitrace - INSTALL_RPATH "\$ORIGIN:\$ORIGIN/dyninst-tpls/libs") + omnitrace-library + PROPERTIES OUTPUT_NAME omnitrace + INSTALL_RPATH + "\$ORIGIN:\$ORIGIN/timemory/libunwind:\$ORIGIN/dyninst-tpls/libs") install( TARGETS omnitrace-library DESTINATION ${CMAKE_INSTALL_LIBDIR} OPTIONAL) +# ------------------------------------------------------------------------------# +# +# omnitrace-avail target +# +# ------------------------------------------------------------------------------# + +add_executable(omnitrace-avail ${CMAKE_CURRENT_LIST_DIR}/src/avail.cpp + ${CMAKE_CURRENT_LIST_DIR}/include/avail.hpp) + +target_include_directories(omnitrace-avail PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include) +target_compile_definitions(omnitrace-avail PRIVATE 
OMNITRACE_EXTERN_COMPONENTS=0) +target_link_libraries(omnitrace-avail PRIVATE omnitrace-library) +set_target_properties(omnitrace-avail PROPERTIES INSTALL_RPATH_USE_LINK_PATH ON) + +install( + TARGETS omnitrace-avail + DESTINATION bin + OPTIONAL) + # ------------------------------------------------------------------------------# # # omnitrace-exe target @@ -234,7 +282,7 @@ set_target_properties( OUTPUT_NAME omnitrace INSTALL_RPATH_USE_LINK_PATH ON INSTALL_RPATH - "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib" + "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/timemory/libunwind:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib" ) install( @@ -242,9 +290,6 @@ install( DESTINATION ${CMAKE_INSTALL_BINDIR} OPTIONAL) -# build the timemory-avail exe -add_dependencies(omnitrace-exe timemory-avail) - # ------------------------------------------------------------------------------# # # miscellaneous installs @@ -275,7 +320,9 @@ install( # # ------------------------------------------------------------------------------# -add_subdirectory(examples) +if(OMNITRACE_BUILD_TESTING) + add_subdirectory(examples) +endif() # ------------------------------------------------------------------------------# # @@ -283,10 +330,12 @@ add_subdirectory(examples) # # ------------------------------------------------------------------------------# -include(CTest) -enable_testing() +if(OMNITRACE_BUILD_TESTING) + include(CTest) + enable_testing() -add_subdirectory(tests) + add_subdirectory(tests) +endif() # ------------------------------------------------------------------------------# # diff --git a/projects/rocprofiler-systems/LICENSE b/projects/rocprofiler-systems/LICENSE index d92876e2a5..5477049050 100644 --- a/projects/rocprofiler-systems/LICENSE +++ b/projects/rocprofiler-systems/LICENSE @@ -1,27 +1,21 @@ -Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. 
+MIT License + +Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal -with the Software without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the Software is +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimers in the - documentation and/or other materials provided with the distribution. - - * Neither the names of Advanced Micro Devices, Inc. nor the names of its - contributors may be used to endorse or promote products derived from - this Software without specific prior written permission. +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH -THE SOFTWARE. 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/projects/rocprofiler-systems/README.md b/projects/rocprofiler-systems/README.md index 679e0002fa..3f61bd9084 100755 --- a/projects/rocprofiler-systems/README.md +++ b/projects/rocprofiler-systems/README.md @@ -53,19 +53,53 @@ omnitrace -- ## Omnitrace Library Environment Settings -| Environment Variable | Default Value | Description | -|-----------------------------|-------------------------------|----------------------------------------------------------------------------------| -| `OMNITRACE_DEBUG` | `false` | Enable debugging statements | -| `OMNITRACE_USE_PERFETTO` | `true` | Collect profiling data via perfetto | -| `OMNITRACE_USE_TIMEMORY` | `false` | Collection profiling data via timemory | -| `OMNITRACE_SAMPLE_RATE` | `1` | Invoke perfetto and/or timemory once every N function calls | -| `OMNITRACE_USE_MPI` | `true` | Label perfetto output files via rank instead of PID | -| `OMNITRACE_OUTPUT_FILE` | `perfetto-trace.%rank%.proto` | Output file for perfetto (may use `%pid`) | -| `OMNITRACE_BACKEND` | `"inprocess"` | Configure perfetto to use either "inprocess" data management, "system", or "all" | -| `OMNITRACE_COMPONENTS` | `"wall_clock"` | Timemory components to activate when enabled | -| `OMNITRACE_SHMEM_SIZE_HINT` | `40960` | Hint for perfetto shared memory buffer | -| `OMNITRACE_BUFFER_SIZE_KB` | `1024000` | Maximum amount of memory perfetto will use to collect data in-process | -| `TIMEMORY_TIME_OUTPUT` | `true` | Create unique output subdirectory with date and launch time | +| Environment Variable | Default Value | Description | +|--------------------------------------------|--------------------------|------------------------------------------------------------------------------------------------------------------| +| `OMNITRACE_USE_PERFETTO` | `false` | Enable perfetto backend | +| `OMNITRACE_USE_PID` | `true` | Enable tagging filenames with process 
identifier (either MPI rank or pid) | +| `OMNITRACE_USE_ROCTRACER` | `true` | Enable ROCM tracing | +| `OMNITRACE_USE_SAMPLING` | `true` | Enable statistical sampling of call-stack | +| `OMNITRACE_USE_TIMEMORY` | `false` | Enable timemory backend | +| `OMNITRACE_BACKEND` | `inprocess` | Specify the perfetto backend to activate. Options are: 'inprocess', 'system', or 'all' | +| `OMNITRACE_BUFFER_SIZE_KB` | `1024000` | Size of perfetto buffer (in KB) | +| `OMNITRACE_COUT_OUTPUT` | `false` | Write output to stdout | +| `OMNITRACE_CRITICAL_TRACE` | `false` | Enable generation of the critical trace | +| `OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT` | `2000` | Number of critical trace records to store in thread-local memory before submitting to shared buffer | +| `OMNITRACE_CRITICAL_TRACE_COUNT` | `0` | Number of critical trace to export (0 == all) | +| `OMNITRACE_CRITICAL_TRACE_DEBUG` | `false` | Enable debugging for critical trace | +| `OMNITRACE_CRITICAL_TRACE_NUM_THREADS` | `8` | Number of threads to use when generating the critical trace | +| `OMNITRACE_CRITICAL_TRACE_PER_ROW` | `0` | How many critical traces per row in perfetto (0 == all in one row) | +| `OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES` | `false` | Include names in serialization of critical trace (mainly for debugging) | +| `OMNITRACE_DIFF_OUTPUT` | `false` | Generate a difference output vs. 
a pre-existing output (see also: TIMEMORY_INPUT_PATH and TIMEMORY_INPUT_PREFIX) | +| `OMNITRACE_FLAT_SAMPLING` | `false` | Ignore hierarchy in all statistical sampling entries | +| `OMNITRACE_INSTRUMENTATION_INTERVAL` | `1` | Instrumentation only takes measurements once every N function calls (not statistical) | +| `OMNITRACE_JSON_OUTPUT` | `true` | Write json output files | +| `OMNITRACE_MEMORY_PRECISION` | `-1` | Set the precision for components with 'is_memory_category' type-trait | +| `OMNITRACE_MEMORY_SCIENTIFIC` | `false` | Set the numerical reporting format for components with 'is_memory_category' type-trait | +| `OMNITRACE_MEMORY_UNITS` | `""` | Set the units for components with 'uses_memory_units' type-trait | +| `OMNITRACE_OUTPUT_FILE` | `""` | Perfetto filename | +| `OMNITRACE_OUTPUT_PATH` | `omnitrace-{EXE}-output` | Explicitly specify the output folder for results | +| `OMNITRACE_OUTPUT_PREFIX` | `""` | Explicitly specify a prefix for all output files | +| `OMNITRACE_PRECISION` | `-1` | Set the global output precision for components | +| `OMNITRACE_ROCTRACER_FLAT_PROFILE` | `false` | Ignore hierarchy in all kernel entries with timemory backend | +| `OMNITRACE_ROCTRACER_HSA_ACTIVITY` | `false` | Enable HSA activity tracing support | +| `OMNITRACE_ROCTRACER_HSA_API` | `false` | Enable HSA API tracing support | +| `OMNITRACE_ROCTRACER_HSA_API_TYPES` | `""` | HSA API type to collect | +| `OMNITRACE_ROCTRACER_TIMELINE_PROFILE` | `false` | Create unique entries for every kernel with timemory backend | +| `OMNITRACE_SAMPLING_DELAY` | `1e-06` | Number of seconds to delay activating the statistical sampling | +| `OMNITRACE_SAMPLING_FREQ` | `10` | Number of software interrupts per second when OMNITRACE_USE_SAMPLING=ON | +| `OMNITRACE_SCIENTIFIC` | `false` | Set the global numerical reporting to scientific format | +| `OMNITRACE_SETTINGS_DESC` | `false` | Provide descriptions when printing settings | +| `OMNITRACE_SHMEM_SIZE_HINT_KB` | `40960` | Hint for
shared-memory buffer size in perfetto (in KB) | +| `OMNITRACE_TEXT_OUTPUT` | `true` | Write text output files | +| `OMNITRACE_TIMELINE_SAMPLING` | `false` | Create unique entries for every sample when statistical sampling is enabled | +| `OMNITRACE_TIMEMORY_COMPONENTS` | `wall_clock` | List of components to collect via timemory (see timemory-avail) | +| `OMNITRACE_TIME_FORMAT` | `%F_%I.%M_%p` | Customize the folder generation when OMNITRACE_TIME_OUTPUT is enabled (see also: strftime) | +| `OMNITRACE_TIME_OUTPUT` | `true` | Output data to subfolder w/ a timestamp (see also: OMNITRACE_TIME_FORMAT) | +| `OMNITRACE_TIMING_PRECISION` | `6` | Set the precision for components with 'is_timing_category' type-trait | +| `OMNITRACE_TIMING_SCIENTIFIC` | `false` | Set the numerical reporting format for components with 'is_timing_category' type-trait | +| `OMNITRACE_TIMING_UNITS` | `""` | Set the units for components with 'uses_timing_units' type-trait | +| `OMNITRACE_TREE_OUTPUT` | `true` | Write hierarchical json output files | ### Example Omnitrace Instrumentation @@ -165,7 +199,7 @@ variable. The special character sequences `%pid%` and `%rank%` will be replaced ## Merging the traces from rocprof and omnitrace -> NOTE: Using `rocprof` externally is deprecated. The current version has built-in support for +> NOTE: Using `rocprof` externally for tracing is deprecated. The current version has built-in support for > recording the GPU activity and HIP API calls. If you want to use an external rocprof, either > configure CMake with `-DOMNITRACE_USE_ROCTRACER=OFF` or explicitly set `TIMEMORY_ROCTRACER_ENABLED=OFF` in the > environment.
diff --git a/projects/rocprofiler-systems/cmake/Formatting.cmake b/projects/rocprofiler-systems/cmake/Formatting.cmake index f35118b6de..44378e7537 100644 --- a/projects/rocprofiler-systems/cmake/Formatting.cmake +++ b/projects/rocprofiler-systems/cmake/Formatting.cmake @@ -45,6 +45,11 @@ if(OMNITRACE_CLANG_FORMAT_EXE) file(GLOB_RECURSE headers ${PROJECT_SOURCE_DIR}/include/*.hpp) file(GLOB_RECURSE examples ${PROJECT_SOURCE_DIR}/examples/*.cpp ${PROJECT_SOURCE_DIR}/examples/*.hpp) + file(GLOB_RECURSE external ${PROJECT_SOURCE_DIR}/examples/lulesh/external/*.cpp + ${PROJECT_SOURCE_DIR}/examples/lulesh/external/*.hpp) + if(external) + list(REMOVE_ITEM examples ${external}) + endif() add_custom_target( format-omnitrace ${OMNITRACE_CLANG_FORMAT_EXE} -i ${sources} ${headers} ${examples} diff --git a/projects/rocprofiler-systems/cmake/Packages.cmake b/projects/rocprofiler-systems/cmake/Packages.cmake index 2661b1bf99..daa774da0e 100644 --- a/projects/rocprofiler-systems/cmake/Packages.cmake +++ b/projects/rocprofiler-systems/cmake/Packages.cmake @@ -13,6 +13,7 @@ omnitrace_add_interface_library(omnitrace-threading "Enables multithreading supp omnitrace_add_interface_library( omnitrace-dyninst "Provides flags and libraries for Dyninst (dynamic instrumentation)") +omnitrace_add_interface_library(omnitrace-hip "Provides flags and libraries for HIP") omnitrace_add_interface_library(omnitrace-roctracer "Provides flags and libraries for roctracer") omnitrace_add_interface_library(omnitrace-mpi "Provides MPI or MPI headers") @@ -24,6 +25,9 @@ target_include_directories(omnitrace-headers INTERFACE ${PROJECT_SOURCE_DIR}/inc # include threading because of rooflines target_link_libraries(omnitrace-headers INTERFACE omnitrace-threading) +# ensure the env overrides the appending /opt/rocm later +string(REPLACE ":" ";" CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH};${CMAKE_PREFIX_PATH}") + # ----------------------------------------------------------------------------------------# # # 
Threading @@ -47,6 +51,19 @@ if(pthread_LIBRARY AND NOT WIN32) target_link_libraries(omnitrace-threading INTERFACE ${pthread_LIBRARY}) endif() +# ----------------------------------------------------------------------------------------# +# +# HIP +# +# ----------------------------------------------------------------------------------------# + +if(OMNITRACE_USE_HIP) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + find_package(hip ${omnitrace_FIND_QUIETLY} REQUIRED) + target_compile_definitions(omnitrace-hip INTERFACE OMNITRACE_USE_HIP) + target_link_libraries(omnitrace-hip INTERFACE hip::host) +endif() + # ----------------------------------------------------------------------------------------# # # roctracer @@ -56,9 +73,9 @@ endif() if(OMNITRACE_USE_ROCTRACER) list(APPEND CMAKE_PREFIX_PATH /opt/rocm) find_package(roctracer ${omnitrace_FIND_QUIETLY} REQUIRED) - find_package(hip ${omnitrace_FIND_QUIETLY} REQUIRED) target_compile_definitions(omnitrace-roctracer INTERFACE OMNITRACE_USE_ROCTRACER) - target_link_libraries(omnitrace-roctracer INTERFACE hip::host roctracer::roctracer) + target_link_libraries(omnitrace-roctracer INTERFACE roctracer::roctracer + omnitrace::omnitrace-hip) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${roctracer_LIBRARY_DIRS}") endif() @@ -297,45 +314,77 @@ set(TIMEMORY_BUILD_TOOLS set(TIMEMORY_BUILD_EXCLUDE_FROM_ALL ON CACHE BOOL "Set timemory to only build dependencies") +set(TIMEMORY_BUILD_HIDDEN_VISIBILITY + ON + CACHE BOOL "Build timemory with hidden visibility") set(TIMEMORY_QUIET_CONFIG ON CACHE BOOL "Make timemory configuration quieter") # timemory feature settings +set(TIMEMORY_USE_MPI + ${OMNITRACE_USE_MPI} + CACHE BOOL "Enable MPI support in timemory" FORCE) set(TIMEMORY_USE_GOTCHA ON CACHE BOOL "Enable GOTCHA support in timemory") set(TIMEMORY_USE_PERFETTO OFF CACHE BOOL "Disable perfetto support in timemory") +set(TIMEMORY_USE_LIBUNWIND + ON + CACHE BOOL "Enable libunwind support in timemory") + # timemory feature build settings 
set(TIMEMORY_BUILD_GOTCHA ON CACHE BOOL "Enable building GOTCHA library from submodule") +set(TIMEMORY_BUILD_LIBUNWIND + ON + CACHE BOOL "Enable building libunwind library from submodule") +set(TIMEMORY_BUILD_EXTRA_OPTIMIZATIONS + ${OMNITRACE_BUILD_EXTRA_OPTIMIZATIONS} + CACHE BOOL "Enable building GOTCHA library from submodule" FORCE) + # timemory build settings set(TIMEMORY_TLS_MODEL "global-dynamic" CACHE STRING "Thread-local static model" FORCE) +set(TIMEMORY_SETTINGS_PREFIX + "OMNITRACE_" + CACHE STRING "Prefix used for settings and environment variables") +mark_as_advanced(TIMEMORY_SETTINGS_PREFIX) + omnitrace_checkout_git_submodule( RELATIVE_PATH external/timemory WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} REPO_URL https://github.com/NERSC/timemory.git REPO_BRANCH gpu-kernel-instrumentation) -omnitrace_save_variables(BUILD_CONFIG VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS - CMAKE_POSITION_INDEPENDENT_CODE) +omnitrace_save_variables( + BUILD_CONFIG VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS + CMAKE_POSITION_INDEPENDENT_CODE CMAKE_PREFIX_PATH) # ensure timemory builds PIC static libs so that we don't have to install timemory shared # lib set(BUILD_SHARED_LIBS ON) set(BUILD_STATIC_LIBS OFF) set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(TIMEMORY_CTP_OPTIONS GLOBAL) + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + # results in undefined symbols to component::base::load() + set(TIMEMORY_BUILD_HIDDEN_VISIBILITY + OFF + CACHE BOOL "" FORCE) +endif() add_subdirectory(external/timemory) -omnitrace_restore_variables(BUILD_CONFIG VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS - CMAKE_POSITION_INDEPENDENT_CODE) +omnitrace_restore_variables( + BUILD_CONFIG VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS + CMAKE_POSITION_INDEPENDENT_CODE CMAKE_PREFIX_PATH) # ----------------------------------------------------------------------------------------# # diff --git a/projects/rocprofiler-systems/docker/Dockerfile b/projects/rocprofiler-systems/docker/Dockerfile index 
1ef4456874..99db7f3f5d 100644 --- a/projects/rocprofiler-systems/docker/Dockerfile +++ b/projects/rocprofiler-systems/docker/Dockerfile @@ -13,12 +13,13 @@ WORKDIR /tmp SHELL [ "/bin/bash", "-c" ] ARG EXTRA_PACKAGES="" +ARG ROCM_REPO_VERSION="debian" RUN apt-get update && \ apt-get dist-upgrade -y && \ - apt-get install -y build-essential cmake libnuma-dev wget gnupg2 m4 bash-completion git-core && \ + apt-get install -y build-essential cmake libnuma-dev wget gnupg2 m4 bash-completion git-core autoconf libtool autotools-dev && \ wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ - echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ ubuntu main' | tee /etc/apt/sources.list.d/rocm.list && \ + echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_REPO_VERSION}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list && \ apt-get update && \ apt-get dist-upgrade -y && \ apt-get install -y rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} diff --git a/projects/rocprofiler-systems/docker/build-docker-release.sh b/projects/rocprofiler-systems/docker/build-docker-release.sh new file mode 100755 index 0000000000..702d977871 --- /dev/null +++ b/projects/rocprofiler-systems/docker/build-docker-release.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +if [ ! -f CMakeLists.txt ]; then + echo "Error! 
Execute script from source directory" + exit 1 +fi + +set -e + +build-release() +{ + CONTAINER=$1 + ROCM_VERSION=$2 + CODE_VERSION=$3 + docker run -it --rm -v ${PWD}:/home/omnitrace --env ROCM_VERSION=${ROCM_VERSION} --env VERSION=${CODE_VERSION} ${CONTAINER} /home/omnitrace/scripts/build-release.sh +} + +CODE_VERSION=$(cat VERSION) + +build-release jrmadsen/omnitrace-base-rocm-4.5 4.5.0 ${CODE_VERSION} +build-release jrmadsen/omnitrace-base-rocm-4.3 4.3.0 ${CODE_VERSION} +build-release jrmadsen/omnitrace-base-rocm-4.3.1 4.3.1 ${CODE_VERSION} diff --git a/projects/rocprofiler-systems/docker/build-docker.sh b/projects/rocprofiler-systems/docker/build-docker.sh new file mode 100755 index 0000000000..8795ef2df0 --- /dev/null +++ b/projects/rocprofiler-systems/docker/build-docker.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +: ${ROCM_VERSIONS:="4.5 4.3 4.3.1"} + +for i in ${ROCM_VERSIONS} +do + docker build . --tag jrmadsen/omnitrace-base-rocm-${i} --build-arg ROCM_REPO_VERSION=${i} +done diff --git a/projects/rocprofiler-systems/examples/CMakeLists.txt b/projects/rocprofiler-systems/examples/CMakeLists.txt index bcd71d34c5..85a7cbd2fc 100644 --- a/projects/rocprofiler-systems/examples/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/CMakeLists.txt @@ -7,3 +7,7 @@ set(CMAKE_CXX_VISIBILITY_PRESET "default") add_subdirectory(transpose) add_subdirectory(parallel-overhead) + +option(BUILD_SHARED_LIBS "Build dynamic libraries" ON) + +add_subdirectory(lulesh) diff --git a/projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt b/projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt new file mode 100644 index 0000000000..f90d576d46 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.15 FATAL_ERROR) + +project(lulesh LANGUAGES C CXX) + +list(INSERT CMAKE_MODULE_PATH 0 ${PROJECT_SOURCE_DIR}/cmake/Modules) + +add_subdirectory(external) + +set(CMAKE_CXX_EXTENSIONS OFF) + 
+if("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE + "RelWithDebInfo" + CACHE STRING "CMake build type" FORCE) +endif() + +if(DEFINED OMNITRACE_USE_MPI) + option(LULESH_USE_MPI "Enable MPI" ${OMNITRACE_USE_MPI}) +else() + option(LULESH_USE_MPI "Enable MPI" OFF) +endif() + +add_library(lulesh-mpi INTERFACE) +if(LULESH_USE_MPI) + find_package(MPI REQUIRED) + target_compile_definitions(lulesh-mpi INTERFACE USE_MPI=1) + target_link_libraries(lulesh-mpi INTERFACE MPI::MPI_C MPI::MPI_CXX) +else() + target_compile_definitions(lulesh-mpi INTERFACE USE_MPI=0) +endif() + +if(NOT TARGET Kokkos::kokkos) + find_package(Kokkos REQUIRED) +endif() + +file(GLOB headers ${PROJECT_SOURCE_DIR}/*.h ${PROJECT_SOURCE_DIR}/*.hxx) +file(GLOB sources ${PROJECT_SOURCE_DIR}/*.cc) + +add_executable(${PROJECT_NAME} ${sources} ${headers}) +target_include_directories(${PROJECT_NAME} PRIVATE ${PROJECT_SOURCE_DIR}/includes) +target_link_libraries(${PROJECT_NAME} PRIVATE Kokkos::kokkos lulesh-mpi) + +if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}) +endif() + +enable_testing() +if(LULESH_USE_MPI) + add_test( + NAME lulesh + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 8 + $ -i 100 -s 20 -p + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) +else() + add_test( + NAME lulesh + COMMAND $ -i 100 -s 20 -p + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) +endif() diff --git a/projects/rocprofiler-systems/examples/lulesh/cmake/Modules/Utilities.cmake b/projects/rocprofiler-systems/examples/lulesh/cmake/Modules/Utilities.cmake new file mode 100644 index 0000000000..3a7f55da23 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/cmake/Modules/Utilities.cmake @@ -0,0 +1,315 @@ +# include guard +include_guard(DIRECTORY) + +# MacroUtilities - useful macros and functions for generic tasks +# + +include(CMakeDependentOption) +include(CMakeParseArguments) + +# 
----------------------------------------------------------------------- +# function - capitalize - make a string capitalized (first letter is capital) usage: +# capitalize("SHARED" CShared) message(STATUS "-- CShared is \"${CShared}\"") $ -- CShared +# is "Shared" +function(CAPITALIZE str var) + # make string lower + string(TOLOWER "${str}" str) + string(SUBSTRING "${str}" 0 1 _first) + string(TOUPPER "${_first}" _first) + string(SUBSTRING "${str}" 1 -1 _remainder) + string(CONCAT str "${_first}" "${_remainder}") + set(${var} + "${str}" + PARENT_SCOPE) +endfunction() + +# ----------------------------------------------------------------------------------------# +# macro CHECKOUT_GIT_SUBMODULE() +# +# Run "git submodule update" if a file in a submodule does not exist +# +# ARGS: RECURSIVE (option) -- add "--recursive" flag RELATIVE_PATH (one value) -- +# typically the relative path to submodule from PROJECT_SOURCE_DIR WORKING_DIRECTORY (one +# value) -- (default: PROJECT_SOURCE_DIR) TEST_FILE (one value) -- file to check for +# (default: CMakeLists.txt) ADDITIONAL_CMDS (many value) -- any addition commands to pass +# +function(CHECKOUT_GIT_SUBMODULE) + # parse args + cmake_parse_arguments( + CHECKOUT "RECURSIVE" + "RELATIVE_PATH;WORKING_DIRECTORY;TEST_FILE;REPO_URL;REPO_BRANCH" + "ADDITIONAL_CMDS" ${ARGN}) + + if(NOT CHECKOUT_WORKING_DIRECTORY) + set(CHECKOUT_WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + endif() + + if(NOT CHECKOUT_TEST_FILE) + set(CHECKOUT_TEST_FILE "Makefile") + endif() + + # default assumption + if(NOT CHECKOUT_REPO_BRANCH) + set(CHECKOUT_REPO_BRANCH "master") + endif() + + find_package(Git) + set(_DIR "${CHECKOUT_WORKING_DIRECTORY}/${CHECKOUT_RELATIVE_PATH}") + # ensure the (possibly empty) directory exists + if(NOT EXISTS "${_DIR}") + if(NOT CHECKOUT_REPO_URL) + message(FATAL_ERROR "submodule directory does not exist") + endif() + endif() + + # if this file exists --> project has been checked out if not exists --> not been + # checked out + 
set(_TEST_FILE "${_DIR}/${CHECKOUT_TEST_FILE}") + # assuming a .gitmodules file exists + set(_SUBMODULE "${PROJECT_SOURCE_DIR}/.gitmodules") + + set(_TEST_FILE_EXISTS OFF) + if(EXISTS "${_TEST_FILE}" AND NOT IS_DIRECTORY "${_TEST_FILE}") + set(_TEST_FILE_EXISTS ON) + endif() + + if(_TEST_FILE_EXISTS) + return() + endif() + + find_package(Git REQUIRED) + + set(_SUBMODULE_EXISTS OFF) + if(EXISTS "${_SUBMODULE}" AND NOT IS_DIRECTORY "${_SUBMODULE}") + set(_SUBMODULE_EXISTS ON) + endif() + + set(_HAS_REPO_URL OFF) + if(NOT "${CHECKOUT_REPO_URL}" STREQUAL "") + set(_HAS_REPO_URL ON) + endif() + + # if the module has not been checked out + if(NOT _TEST_FILE_EXISTS AND _SUBMODULE_EXISTS) + # perform the checkout + execute_process( + COMMAND ${GIT_EXECUTABLE} submodule update --init ${_RECURSE} + ${CHECKOUT_ADDITIONAL_CMDS} ${CHECKOUT_RELATIVE_PATH} + WORKING_DIRECTORY ${CHECKOUT_WORKING_DIRECTORY} + RESULT_VARIABLE RET) + + # check the return code + if(RET GREATER 0) + set(_CMD "${GIT_EXECUTABLE} submodule update --init ${_RECURSE} + ${CHECKOUT_ADDITIONAL_CMDS} ${CHECKOUT_RELATIVE_PATH}") + message(STATUS "function(CHECKOUT_GIT_SUBMODULE) failed.") + message(FATAL_ERROR "Command: \"${_CMD}\"") + else() + set(_TEST_FILE_EXISTS ON) + endif() + endif() + + if(NOT _TEST_FILE_EXISTS AND _HAS_REPO_URL) + message( + STATUS "Checking out '${CHECKOUT_REPO_URL}' @ '${CHECKOUT_REPO_BRANCH}'...") + + # remove the existing directory + if(EXISTS "${_DIR}") + execute_process(COMMAND ${CMAKE_COMMAND} -E remove_directory ${_DIR}) + endif() + + # perform the checkout + execute_process( + COMMAND + ${GIT_EXECUTABLE} clone -b ${CHECKOUT_REPO_BRANCH} + ${CHECKOUT_ADDITIONAL_CMDS} ${CHECKOUT_REPO_URL} ${CHECKOUT_RELATIVE_PATH} + WORKING_DIRECTORY ${CHECKOUT_WORKING_DIRECTORY} + RESULT_VARIABLE RET) + + # perform the submodule update + if(CHECKOUT_RECURSIVE + AND EXISTS "${_DIR}" + AND IS_DIRECTORY "${_DIR}") + execute_process( + COMMAND ${GIT_EXECUTABLE} submodule update --init ${_RECURSE} + 
WORKING_DIRECTORY ${_DIR} + RESULT_VARIABLE RET) + endif() + + # check the return code + if(RET GREATER 0) + set(_CMD + "${GIT_EXECUTABLE} clone -b ${CHECKOUT_REPO_BRANCH} + ${CHECKOUT_ADDITIONAL_CMDS} ${CHECKOUT_REPO_URL} ${CHECKOUT_RELATIVE_PATH}" + ) + message(STATUS "function(CHECKOUT_GIT_SUBMODULE) failed.") + message(FATAL_ERROR "Command: \"${_CMD}\"") + else() + set(_TEST_FILE_EXISTS ON) + endif() + endif() + + if(NOT EXISTS "${_TEST_FILE}" OR NOT _TEST_FILE_EXISTS) + message( + FATAL_ERROR + "Error checking out submodule: '${CHECKOUT_RELATIVE_PATH}' to '${_DIR}'") + endif() + +endfunction() + +# ----------------------------------------------------------------------------------------# +# require variable +# +function(CHECK_REQUIRED VAR) + if(NOT DEFINED ${VAR} OR "${${VAR}}" STREQUAL "") + message(FATAL_ERROR "Variable '${VAR}' must be defined and not empty") + endif() +endfunction() + +# ----------------------------------------------------------------------- +# function add_feature( ) Add a project feature, whose activation is +# specified by the existence of the variable , to the list of enabled/disabled +# features, plus a docstring describing the feature +# +function(ADD_FEATURE _var _description) + set(EXTRA_DESC "") + foreach(currentArg ${ARGN}) + if(NOT "${currentArg}" STREQUAL "${_var}" AND NOT "${currentArg}" STREQUAL + "${_description}") + set(EXTRA_DESC "${EXTA_DESC}${currentArg}") + endif() + endforeach() + + set_property(GLOBAL APPEND PROPERTY ${PROJECT_NAME}_FEATURES ${_var}) + set_property(GLOBAL PROPERTY ${_var}_DESCRIPTION "${_description}${EXTRA_DESC}") + + if("CMAKE_DEFINE" IN_LIST ARGN) + set_property(GLOBAL APPEND PROPERTY ${PROJECT_NAME}_CMAKE_DEFINES + "${_var} @${_var}@") + endif() +endfunction() + +# ----------------------------------------------------------------------------------------# +# function add_option( [NO_FEATURE]) Add an +# option and add as a feature if NO_FEATURE is not provided +# +function(ADD_OPTION _NAME _MESSAGE 
_DEFAULT) + option(${_NAME} "${_MESSAGE}" ${_DEFAULT}) + if("NO_FEATURE" IN_LIST ARGN) + mark_as_advanced(${_NAME}) + else() + add_feature(${_NAME} "${_MESSAGE}") + endif() + if("ADVANCED" IN_LIST ARGN) + mark_as_advanced(${_NAME}) + endif() +endfunction() + +# ----------------------------------------------------------------------------------------# +# function print_enabled_features() Print enabled features plus their docstrings. +# +function(PRINT_ENABLED_FEATURES) + set(_basemsg "The following features are defined/enabled (+):") + set(_currentFeatureText "${_basemsg}") + get_property(_features GLOBAL PROPERTY ${PROJECT_NAME}_FEATURES) + if(NOT "${_features}" STREQUAL "") + list(REMOVE_DUPLICATES _features) + list(SORT _features) + endif() + foreach(_feature ${_features}) + if(${_feature}) + # add feature to text + set(_currentFeatureText "${_currentFeatureText}\n ${_feature}") + # get description + get_property(_desc GLOBAL PROPERTY ${_feature}_DESCRIPTION) + # print description, if not standard ON/OFF, print what is set to + if(_desc) + if(NOT "${${_feature}}" STREQUAL "ON" AND NOT "${${_feature}}" STREQUAL + "TRUE") + set(_currentFeatureText + "${_currentFeatureText}: ${_desc} -- [\"${${_feature}}\"]") + else() + string(REGEX REPLACE "^${PROJECT_NAME}_USE_" "" _feature_tmp + "${_feature}") + string(TOLOWER "${_feature_tmp}" _feature_tmp_l) + capitalize("${_feature_tmp}" _feature_tmp_c) + foreach(_var _feature _feature_tmp _feature_tmp_l _feature_tmp_c) + set(_ver "${${${_var}}_VERSION}") + if(NOT "${_ver}" STREQUAL "") + set(_desc "${_desc} -- [found version ${_ver}]") + break() + endif() + unset(_ver) + endforeach() + set(_currentFeatureText "${_currentFeatureText}: ${_desc}") + endif() + set(_desc NOTFOUND) + endif() + endif() + endforeach() + + if(NOT "${_currentFeatureText}" STREQUAL "${_basemsg}") + message(STATUS "${_currentFeatureText}\n") + endif() +endfunction() + +# 
----------------------------------------------------------------------------------------# +# function print_disabled_features() Print disabled features plus their docstrings. +# +function(PRINT_DISABLED_FEATURES) + set(_basemsg "The following features are NOT defined/enabled (-):") + set(_currentFeatureText "${_basemsg}") + get_property(_features GLOBAL PROPERTY ${PROJECT_NAME}_FEATURES) + if(NOT "${_features}" STREQUAL "") + list(REMOVE_DUPLICATES _features) + list(SORT _features) + endif() + foreach(_feature ${_features}) + if(NOT ${_feature}) + set(_currentFeatureText "${_currentFeatureText}\n ${_feature}") + get_property(_desc GLOBAL PROPERTY ${_feature}_DESCRIPTION) + if(_desc) + set(_currentFeatureText "${_currentFeatureText}: ${_desc}") + set(_desc NOTFOUND) + endif(_desc) + endif() + endforeach(_feature) + + if(NOT "${_currentFeatureText}" STREQUAL "${_basemsg}") + message(STATUS "${_currentFeatureText}\n") + endif() +endfunction() + +# ----------------------------------------------------------------------------------------# +# function print_features() Print all features plus their docstrings. +# +function(PRINT_FEATURES) + message(STATUS "") + print_enabled_features() + print_disabled_features() +endfunction() + +# ----------------------------------------------------------------------------------------# +# macro ADD_SUBPROJECT() Does a git submodule update + add_subdirectory +# +macro(ADD_SUBPROJECT PACKAGE_NAME) + # parse args + cmake_parse_arguments(PACKAGE "SUBMODULE" "DIRECTORY" "" ${ARGN}) + if(NOT PACKAGE_DIRECTORY) + set(PACKAGE_DIRECTORY ${PACKAGE_NAME}) + endif() + # if specified in options + if("${PACKAGE_NAME}" IN_LIST PROJECTS) + if(PACKAGE_SUBMODULE) + checkout_git_submodule(RECURSIVE RELATIVE_PATH ${PACKAGE_DIRECTORY}) + endif() + if(NOT EXISTS "${PROJECT_SOURCE_DIR}/${PACKAGE_DIRECTORY}/CMakeLists.txt") + message( + STATUS + "Warning! '${PROJECT_SOURCE_DIR}/${PACKAGE_DIRECTORY}/CMakeLists.txt' does not exist!" 
+ ) + else() + add_subdirectory(${PACKAGE_DIRECTORY}) + endif() + endif() +endmacro() diff --git a/projects/rocprofiler-systems/examples/lulesh/external/CMakeLists.txt b/projects/rocprofiler-systems/examples/lulesh/external/CMakeLists.txt new file mode 100644 index 0000000000..7ede9ec559 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/external/CMakeLists.txt @@ -0,0 +1,28 @@ +set(Kokkos_ENABLE_SERIAL + ON + CACHE BOOL "Enable Serial") +set(Kokkos_ENABLE_OPENMP + ON + CACHE BOOL "Enable OpenMP") +if(USE_CUDA) + set(Kokkos_ENABLE_CUDA + ON + CACHE BOOL "Enable CUDA") + set(Kokkos_ENABLE_CUDA_UVM + ON + CACHE BOOL "Enable CUDA UVM") + set(Kokkos_ENABLE_CUDA_LAMBDA + ON + CACHE BOOL "Enable CUDA Lambda") + set(Kokkos_ENABLE_CUDA_CONSTEXPR + ON + CACHE BOOL "Enable CUDA constexpr") +endif() + +checkout_git_submodule( + RELATIVE_PATH external/kokkos WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} REPO_URL + https://github.com/kokkos/kokkos.git REPO_BRANCH develop) + +set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY ON) + +add_subdirectory(kokkos) diff --git a/projects/rocprofiler-systems/examples/lulesh/external/kokkos b/projects/rocprofiler-systems/examples/lulesh/external/kokkos new file mode 160000 index 0000000000..56468253ef --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/external/kokkos @@ -0,0 +1 @@ +Subproject commit 56468253ef601e6968dd1e6714dea8ee26a561c3 diff --git a/projects/rocprofiler-systems/examples/lulesh/includes/Timer.hxx b/projects/rocprofiler-systems/examples/lulesh/includes/Timer.hxx new file mode 100644 index 0000000000..55902c3632 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/includes/Timer.hxx @@ -0,0 +1,127 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file for simple class that can be used to + * time code sections.
+ * + * \author Rich Hornung, Center for Applied Scientific Computing, LLNL + * \author Jeff Keasler, Applications, Simulations And Quality, LLNL + * + ****************************************************************************** + */ + +#ifndef RAJA_Timer_HXX +#define RAJA_Timer_HXX + +#if defined(RAJA_USE_CYCLE) +# include "./cycle.h" +typedef ticks TimeType; + +#elif defined(RAJA_USE_CLOCK) +# include +typedef clock_t TimeType; + +#elif defined(RAJA_USE_GETTIME) +# include +typedef timespec TimeType; + +#else +# error RAJA_TIMER_TYPE is undefined! + +#endif + +namespace RAJA +{ +/*! + ****************************************************************************** + * + * \brief Simple timer class to time code sections. + * + ****************************************************************************** + */ +class Timer +{ +public: +#if defined(RAJA_USE_CYCLE) || defined(RAJA_USE_CLOCK) + Timer() + : telapsed(0) + { + ; + } +#endif +#if defined(RAJA_USE_GETTIME) + Timer() + : telapsed(0) + , stime_elapsed(0) + , nstime_elapsed(0) + { + ; + } +#endif + +#if defined(RAJA_USE_CYCLE) + void start() { tstart = getticks(); } + void stop() + { + tstop = getticks(); + set_elapsed(); + } + + long double elapsed() { return static_cast(telapsed); } +#endif + +#if defined(RAJA_USE_CLOCK) + void start() { tstart = clock(); } + void stop() + { + tstop = clock(); + set_elapsed(); + } + + long double elapsed() { return static_cast(telapsed) / CLOCKS_PER_SEC; } +#endif + +#if defined(RAJA_USE_GETTIME) + +# if 0 + void start() { clock_gettime(CLOCK_REALTIME, &tstart); } + void stop() { clock_gettime(CLOCK_REALTIME, &tstop); set_elapsed(); } +# else + void start() { clock_gettime(CLOCK_MONOTONIC, &tstart); } + void stop() + { + clock_gettime(CLOCK_MONOTONIC, &tstop); + set_elapsed(); + } +# endif + + long double elapsed() { return (stime_elapsed + nstime_elapsed); } + +#endif + +private: + TimeType tstart; + TimeType tstop; + long double telapsed; + +#if defined(RAJA_USE_CYCLE) || 
defined(RAJA_USE_CLOCK) + void set_elapsed() { telapsed += (tstop - tstart); } + +#elif defined(RAJA_USE_GETTIME) + long double stime_elapsed; + long double nstime_elapsed; + + void set_elapsed() + { + stime_elapsed += static_cast(tstop.tv_sec - tstart.tv_sec); + nstime_elapsed += + static_cast(tstop.tv_nsec - tstart.tv_nsec) / 1000000000.0; + } +#endif +}; + +} // namespace RAJA + +#endif // closing endif for header file include guard diff --git a/projects/rocprofiler-systems/examples/lulesh/includes/cycle.h b/projects/rocprofiler-systems/examples/lulesh/includes/cycle.h new file mode 100644 index 0000000000..fc90d38afe --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/includes/cycle.h @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2003, 2007-8 Matteo Frigo + * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/* machine-dependent cycle counters code. 
Needs to be inlined. */ + +/***************************************************************************/ +/* To use the cycle counters in your code, simply #include "cycle.h" (this + file), and then use the functions/macros: + + ticks getticks(void); + + ticks is an opaque typedef defined below, representing the current time. + You extract the elapsed time between two calls to gettick() via: + + double elapsed(ticks t1, ticks t0); + + which returns a double-precision variable in arbitrary units. You + are not expected to convert this into human units like seconds; it + is intended only for *comparisons* of time intervals. + + (In order to use some of the OS-dependent timer routines like + Solaris' gethrtime, you need to paste the autoconf snippet below + into your configure.ac file and #include "config.h" before cycle.h, + or define the relevant macros manually if you are not using autoconf.) +*/ + +/***************************************************************************/ +/* This file uses macros like HAVE_GETHRTIME that are assumed to be + defined according to whether the corresponding function/type/header + is available on your system. 
The necessary macros are most + conveniently defined if you are using GNU autoconf, via the tests: + + dnl --------------------------------------------------------------------- + + AC_C_INLINE + AC_HEADER_TIME + AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) + + AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is +defined in ])],,[#if HAVE_SYS_TIME_H #include #endif]) + + AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime +mach_absolute_time]) + + dnl Cray UNICOS _rtc() (real-time clock) intrinsic + AC_MSG_CHECKING([for _rtc intrinsic]) + rtc_ok=yes + AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H +#include +#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() +intrinsic.])], [rtc_ok=no]) AC_MSG_RESULT($rtc_ok) + + dnl --------------------------------------------------------------------- +*/ + +/***************************************************************************/ + +#if TIME_WITH_SYS_TIME +# include +# include +#else +# if HAVE_SYS_TIME_H +# include +# else +# include +# endif +#endif + +#define INLINE_ELAPSED(INL) \ + static INL double elapsed(ticks t1, ticks t0) { return (double) t1 - (double) t0; } + +/*----------------------------------------------------------------*/ +/* Solaris */ +#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) +typedef hrtime_t ticks; + +# define getticks gethrtime + +INLINE_ELAPSED(inline) + +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* AIX v. 
4+ routines to read the real-time clock or time-base register */ +#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && \ + !defined(HAVE_TICK_COUNTER) +typedef timebasestruct_t ticks; + +static __inline ticks +getticks(void) +{ + ticks t; + read_real_time(&t, TIMEBASE_SZ); + return t; +} + +static __inline double +elapsed(ticks t1, ticks t0) /* time in nanoseconds */ +{ + time_base_to_time(&t1, TIMEBASE_SZ); + time_base_to_time(&t0, TIMEBASE_SZ); + return (((double) t1.tb_high - (double) t0.tb_high) * 1.0e9 + + ((double) t1.tb_low - (double) t0.tb_low)); +} + +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PowerPC ``cycle'' counter using the time base register. + */ +#if((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || \ + (defined(__MWERKS__) && defined(macintosh)))) || \ + (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && \ + !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks +getticks(void) +{ + unsigned int tbl, tbu0, tbu1; + + do + { + __asm__ __volatile__("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__("mftb %0" : "=r"(tbl)); + __asm__ __volatile__("mftbu %0" : "=r"(tbu1)); + } while(tbu0 != tbu1); + + return (((unsigned long long) tbu0) << 32) | tbl; +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +#endif + +/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime, + from Carbon, requires no additional libraries to be linked). 
*/ +#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && \ + !defined(HAVE_TICK_COUNTER) +# include +typedef uint64_t ticks; +# define getticks mach_absolute_time +INLINE_ELAPSED(__inline__) +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * Pentium cycle counter + */ +#if(defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && \ + !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks +getticks(void) +{ + ticks ret; + + __asm__ __volatile__("rdtsc" : "=A"(ret)); + /* no input, nothing else clobbered */ + return ret; +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +# define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/* Visual C++ -- thanks to Morten Nissov for his help with this */ +#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER) +# include +typedef LARGE_INTEGER ticks; +# define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ + +static __inline ticks +getticks(void) +{ + ticks retval; + + __asm { + RDTSC + mov retval.HighPart, edx + mov retval.LowPart, eax + } + return retval; +} + +static __inline double +elapsed(ticks t1, ticks t0) +{ + return (double) t1.QuadPart - (double) t0.QuadPart; +} + +# define HAVE_TICK_COUNTER +# define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/*----------------------------------------------------------------*/ +/* + * X86-64 cycle counter + */ +#if(defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && \ + defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks +getticks(void) +{ + unsigned a, d; + __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); + return ((ticks) a) | (((ticks) d) << 32); +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +#endif + +/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori. 
+ NOTE: this code will fail to link unless you use the -Masmkeyword compiler + option (grrr). */ +#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; +static ticks +getticks(void) +{ + asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); +} +INLINE_ELAPSED(__inline__) +# define HAVE_TICK_COUNTER +#endif + +/* Visual C++, courtesy of Dirk Michaelis */ +#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && \ + !defined(HAVE_TICK_COUNTER) + +# include +# pragma intrinsic(__rdtsc) +typedef unsigned __int64 ticks; +# define getticks __rdtsc +INLINE_ELAPSED(__inline) + +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * IA64 cycle counter + */ + +/* intel's icc/ecc compiler */ +#if(defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && \ + !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; +# include + +static __inline__ ticks +getticks(void) +{ + return __getReg(_IA64_REG_AR_ITC); +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +#endif + +/* gcc */ +#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks +getticks(void) +{ + ticks ret; + + __asm__ __volatile__("mov %0=ar.itc" : "=r"(ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +#endif + +/* HP/UX IA64 compiler, courtesy Teresa L. 
Johnson: */ +#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER) +# include +typedef unsigned long ticks; + +static inline ticks +getticks(void) +{ + ticks ret; + + ret = _Asm_mov_from_ar(_AREG_ITC); + return ret; +} + +INLINE_ELAPSED(inline) + +# define HAVE_TICK_COUNTER +#endif + +/* Microsoft Visual C++ */ +#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER) +typedef unsigned __int64 ticks; + +# ifdef __cplusplus +extern "C" +# endif + ticks + __getReg(int whichReg); +# pragma intrinsic(__getReg) + +static __inline ticks +getticks(void) +{ + volatile ticks temp; + temp = __getReg(3116); + return temp; +} + +INLINE_ELAPSED(inline) + +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PA-RISC cycle counter + */ +#if(defined(__hppa__) || defined(__hppa)) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +# ifdef __GNUC__ +static __inline__ ticks +getticks(void) +{ + ticks ret; + + __asm__ __volatile__("mfctl 16, %0" : "=r"(ret)); + /* no input, nothing else clobbered */ + return ret; +} +# else +# include +static inline unsigned long +getticks(void) +{ + register ticks ret; + _MFCTL(16, ret); + return ret; +} +# endif + +INLINE_ELAPSED(inline) + +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* S390, courtesy of James Treacy */ +#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks +getticks(void) +{ + ticks cycles; + __asm__("stck 0(%0)" : : "a"(&(cycles)) : "memory", "cc"); + return cycles; +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) +/* + * The 32-bit cycle counter on alpha overflows pretty quickly, + * unfortunately.
A 1GHz machine overflows in 4 seconds. + */ +typedef unsigned int ticks; + +static __inline__ ticks +getticks(void) +{ + unsigned long cc; + __asm__ __volatile__("rpcc %0" : "=r"(cc)); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks +getticks(void) +{ + ticks ret; + __asm__ __volatile__("rd %%tick, %0" : "=r"(ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if(defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && \ + defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) +# include +typedef unsigned int ticks; + +static __inline ticks +getticks(void) +{ + unsigned long cc; + cc = asm("rpcc %v0"); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline) + +# define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +/* SGI/Irix */ +#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) +typedef struct timespec ticks; + +static inline ticks +getticks(void) +{ + struct timespec t; + clock_gettime(CLOCK_SGI_CYCLE, &t); + return t; +} + +static inline double +elapsed(ticks t1, ticks t0) +{ + return ((double) t1.tv_sec - (double) t0.tv_sec) * 1.0E9 + + ((double) t1.tv_nsec - (double) t0.tv_nsec); +} +# define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* Cray UNICOS _rtc() intrinsic function */ +#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) +# ifdef HAVE_INTRINSICS_H +# include +# endif + +typedef long long ticks; + +# define getticks _rtc + +INLINE_ELAPSED(inline) + +# define HAVE_TICK_COUNTER +#endif + 
+/*----------------------------------------------------------------*/ +/* MIPS ZBus */ +#if HAVE_MIPS_ZBUS_TIMER +# if defined(__mips__) && !defined(HAVE_TICK_COUNTER) +# include +# include +# include + +typedef uint64_t ticks; + +static inline ticks +getticks(void) +{ + static uint64_t* addr = 0; + + if(addr == 0) + { + uint32_t rq_addr = 0x10030000; + int fd; + int pgsize; + + pgsize = getpagesize(); + fd = open("/dev/mem", O_RDONLY | O_SYNC, 0); + if(fd < 0) + { + perror("open"); + return NULL; + } + addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr); + close(fd); + if(addr == (uint64_t*) -1) + { + perror("mmap"); + return NULL; + } + } + + return *addr; +} + +INLINE_ELAPSED(inline) + +# define HAVE_TICK_COUNTER +# endif +#endif /* HAVE_MIPS_ZBUS_TIMER */ diff --git a/projects/rocprofiler-systems/examples/lulesh/lulesh-comm.cc b/projects/rocprofiler-systems/examples/lulesh/lulesh-comm.cc new file mode 100644 index 0000000000..c083bd3900 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/lulesh-comm.cc @@ -0,0 +1,2073 @@ +#include "lulesh.h" + +// If no MPI, then this whole file is stubbed out +#if USE_MPI + +# include +# include + +/* Comm Routines */ + +# define ALLOW_UNPACKED_PLANE false +# define ALLOW_UNPACKED_ROW false +# define ALLOW_UNPACKED_COL false + +/* + There are coherence issues for packing and unpacking message + buffers. Ideally, you would like a lot of threads to + cooperate in the assembly/dissassembly of each message. + To do that, each thread should really be operating in a + different coherence zone. + + Let's assume we have three fields, f1 through f3, defined on + a 61x61x61 cube. 
If we want to send the block boundary + information for each field to each neighbor processor across + each cube face, then we have three cases for the + memory layout/coherence of data on each of the six cube + boundaries: + + (a) Two of the faces will be in contiguous memory blocks + (b) Two of the faces will be comprised of pencils of + contiguous memory. + (c) Two of the faces will have large strides between + every value living on the face. + + How do you pack and unpack this data in buffers to + simultaneously achieve the best memory efficiency and + the most thread independence? + + Do you pack field f1 through f3 tightly to reduce message + size? Do you align each field on a cache coherence boundary + within the message so that threads can pack and unpack each + field independently? For case (b), do you align each + boundary pencil of each field separately? This increases + the message size, but could improve cache coherence so + each pencil could be processed independently by a separate + thread with no conflicts. + + Also, memory access for case (c) would best be done without + going through the cache (the stride is so large it just causes + a lot of useless cache evictions). Is it worth creating + a special case version of the packing algorithm that uses + non-coherent load/store opcodes? +*/ + +/******************************************/ + +/* doRecv flag only works with regular block structure */ +void +CommRecv(Domain& domain, int msgType, Index_t xferFields, Index_t dx, Index_t dy, + Index_t dz, bool doRecv, bool planeOnly) +{ + if(domain.numRanks() == 1) + return; + + /* post receive buffers for all incoming messages */ + int myRank; + Index_t maxPlaneComm = xferFields * domain.maxPlaneSize(); + Index_t maxEdgeComm = xferFields * domain.maxEdgeSize(); + Index_t pmsg = 0; /* plane comm msg */ + Index_t emsg = 0; /* edge comm msg */ + Index_t cmsg = 0; /* corner comm msg */ + MPI_Datatype baseType = ((sizeof(Real_t) == 4) ?
MPI_FLOAT : MPI_DOUBLE); + bool rowMin, rowMax, colMin, colMax, planeMin, planeMax; + + /* assume communication to 6 neighbors by default */ + rowMin = rowMax = colMin = colMax = planeMin = planeMax = true; + + if(domain.rowLoc() == 0) + { + rowMin = false; + } + if(domain.rowLoc() == (domain.tp() - 1)) + { + rowMax = false; + } + if(domain.colLoc() == 0) + { + colMin = false; + } + if(domain.colLoc() == (domain.tp() - 1)) + { + colMax = false; + } + if(domain.planeLoc() == 0) + { + planeMin = false; + } + if(domain.planeLoc() == (domain.tp() - 1)) + { + planeMax = false; + } + + for(Index_t i = 0; i < 26; ++i) + { + domain.recvRequest[i] = MPI_REQUEST_NULL; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + + /* post receives */ + + /* receive data from neighboring domain faces */ + if(planeMin && doRecv) + { + /* contiguous memory */ + int fromRank = myRank - domain.tp() * domain.tp(); + int recvCount = dx * dy * xferFields; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], recvCount, baseType, + fromRank, msgType, MPI_COMM_WORLD, &domain.recvRequest[pmsg]); + ++pmsg; + } + if(planeMax) + { + /* contiguous memory */ + int fromRank = myRank + domain.tp() * domain.tp(); + int recvCount = dx * dy * xferFields; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], recvCount, baseType, + fromRank, msgType, MPI_COMM_WORLD, &domain.recvRequest[pmsg]); + ++pmsg; + } + if(rowMin && doRecv) + { + /* semi-contiguous memory */ + int fromRank = myRank - domain.tp(); + int recvCount = dx * dz * xferFields; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], recvCount, baseType, + fromRank, msgType, MPI_COMM_WORLD, &domain.recvRequest[pmsg]); + ++pmsg; + } + if(rowMax) + { + /* semi-contiguous memory */ + int fromRank = myRank + domain.tp(); + int recvCount = dx * dz * xferFields; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], recvCount, baseType, + fromRank, msgType, MPI_COMM_WORLD, &domain.recvRequest[pmsg]); + ++pmsg; + } + if(colMin && doRecv) + { + /* 
scattered memory */ + int fromRank = myRank - 1; + int recvCount = dy * dz * xferFields; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], recvCount, baseType, + fromRank, msgType, MPI_COMM_WORLD, &domain.recvRequest[pmsg]); + ++pmsg; + } + if(colMax) + { + /* scattered memory */ + int fromRank = myRank + 1; + int recvCount = dy * dz * xferFields; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], recvCount, baseType, + fromRank, msgType, MPI_COMM_WORLD, &domain.recvRequest[pmsg]); + ++pmsg; + } + + if(!planeOnly) + { + /* receive data from domains connected only by an edge */ + if(rowMin && colMin && doRecv) + { + int fromRank = myRank - domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMin && planeMin && doRecv) + { + int fromRank = myRank - domain.tp() * domain.tp() - domain.tp(); + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(colMin && planeMin && doRecv) + { + int fromRank = myRank - domain.tp() * domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && colMax) + { + int fromRank = myRank + domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && planeMax) + { + int fromRank = myRank + domain.tp() * domain.tp() + domain.tp(); + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; 
+ } + + if(colMax && planeMax) + { + int fromRank = myRank + domain.tp() * domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && colMin) + { + int fromRank = myRank + domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMin && planeMax) + { + int fromRank = myRank + domain.tp() * domain.tp() - domain.tp(); + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(colMin && planeMax) + { + int fromRank = myRank + domain.tp() * domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMin && colMax && doRecv) + { + int fromRank = myRank - domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && planeMin && doRecv) + { + int fromRank = myRank - domain.tp() * domain.tp() + domain.tp(); + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + if(colMax && planeMin && doRecv) + { + int fromRank = myRank - domain.tp() * domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg]); + ++emsg; + } + + /* receive data from 
domains connected only by a corner */ + if(rowMin && colMin && planeMin && doRecv) + { + /* corner at domain logical coord (0, 0, 0) */ + int fromRank = myRank - domain.tp() * domain.tp() - domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMin && colMin && planeMax) + { + /* corner at domain logical coord (0, 0, 1) */ + int fromRank = myRank + domain.tp() * domain.tp() - domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMin && colMax && planeMin && doRecv) + { + /* corner at domain logical coord (1, 0, 0) */ + int fromRank = myRank - domain.tp() * domain.tp() - domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMin && colMax && planeMax) + { + /* corner at domain logical coord (1, 0, 1) */ + int fromRank = myRank + domain.tp() * domain.tp() - domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMax && colMin && planeMin && doRecv) + { + /* corner at domain logical coord (0, 1, 0) */ + int fromRank = myRank - domain.tp() * domain.tp() + domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + 
++cmsg; + } + if(rowMax && colMin && planeMax) + { + /* corner at domain logical coord (0, 1, 1) */ + int fromRank = myRank + domain.tp() * domain.tp() + domain.tp() - 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMax && colMax && planeMin && doRecv) + { + /* corner at domain logical coord (1, 1, 0) */ + int fromRank = myRank - domain.tp() * domain.tp() + domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMax && colMax && planeMax) + { + /* corner at domain logical coord (1, 1, 1) */ + int fromRank = myRank + domain.tp() * domain.tp() + domain.tp() + 1; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, MPI_COMM_WORLD, + &domain.recvRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + } +} + +/******************************************/ + +void +CommSend(Domain& domain, int msgType, Index_t xferFields, Domain_member* fieldData, + Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly) +{ + if(domain.numRanks() == 1) + return; + + /* post recieve buffers for all incoming messages */ + int myRank; + Index_t maxPlaneComm = xferFields * domain.maxPlaneSize(); + Index_t maxEdgeComm = xferFields * domain.maxEdgeSize(); + Index_t pmsg = 0; /* plane comm msg */ + Index_t emsg = 0; /* edge comm msg */ + Index_t cmsg = 0; /* corner comm msg */ + MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? 
MPI_FLOAT : MPI_DOUBLE); + MPI_Status status[26]; + Real_t* destAddr; + bool rowMin, rowMax, colMin, colMax, planeMin, planeMax; + /* assume communication to 6 neighbors by default */ + rowMin = rowMax = colMin = colMax = planeMin = planeMax = true; + if(domain.rowLoc() == 0) + { + rowMin = false; + } + if(domain.rowLoc() == (domain.tp() - 1)) + { + rowMax = false; + } + if(domain.colLoc() == 0) + { + colMin = false; + } + if(domain.colLoc() == (domain.tp() - 1)) + { + colMax = false; + } + if(domain.planeLoc() == 0) + { + planeMin = false; + } + if(domain.planeLoc() == (domain.tp() - 1)) + { + planeMax = false; + } + + for(Index_t i = 0; i < 26; ++i) + { + domain.sendRequest[i] = MPI_REQUEST_NULL; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + + /* post sends */ + + if(planeMin | planeMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + int sendCount = dx * dy; + + if(planeMin) + { + destAddr = &domain.commDataSend[pmsg * maxPlaneComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < sendCount; ++i) + { + destAddr[i] = (domain.*src)(i); + } + destAddr += sendCount; + } + destAddr -= xferFields * sendCount; + + MPI_Isend(destAddr, xferFields * sendCount, baseType, + myRank - domain.tp() * domain.tp(), msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg]); + ++pmsg; + } + if(planeMax && doSend) + { + destAddr = &domain.commDataSend[pmsg * maxPlaneComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < sendCount; ++i) + { + destAddr[i] = (domain.*src)(dx * dy * (dz - 1) + i); + } + destAddr += sendCount; + } + destAddr -= xferFields * sendCount; + + MPI_Isend(destAddr, xferFields * sendCount, baseType, + myRank + domain.tp() * domain.tp(), msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg]); + ++pmsg; + } + } + if(rowMin | rowMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + int sendCount = dx * 
dz; + + if(rowMin) + { + destAddr = &domain.commDataSend[pmsg * maxPlaneComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dx; ++j) + { + destAddr[i * dx + j] = (domain.*src)(i * dx * dy + j); + } + } + destAddr += sendCount; + } + destAddr -= xferFields * sendCount; + + MPI_Isend(destAddr, xferFields * sendCount, baseType, myRank - domain.tp(), + msgType, MPI_COMM_WORLD, &domain.sendRequest[pmsg]); + ++pmsg; + } + if(rowMax && doSend) + { + destAddr = &domain.commDataSend[pmsg * maxPlaneComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dx; ++j) + { + destAddr[i * dx + j] = + (domain.*src)(dx * (dy - 1) + i * dx * dy + j); + } + } + destAddr += sendCount; + } + destAddr -= xferFields * sendCount; + + MPI_Isend(destAddr, xferFields * sendCount, baseType, myRank + domain.tp(), + msgType, MPI_COMM_WORLD, &domain.sendRequest[pmsg]); + ++pmsg; + } + } + if(colMin | colMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + int sendCount = dy * dz; + + if(colMin) + { + destAddr = &domain.commDataSend[pmsg * maxPlaneComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dy; ++j) + { + destAddr[i * dy + j] = (domain.*src)(i * dx * dy + j * dx); + } + } + destAddr += sendCount; + } + destAddr -= xferFields * sendCount; + + MPI_Isend(destAddr, xferFields * sendCount, baseType, myRank - 1, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg]); + ++pmsg; + } + if(colMax && doSend) + { + destAddr = &domain.commDataSend[pmsg * maxPlaneComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dy; ++j) + { + destAddr[i * dy + j] = + (domain.*src)(dx - 1 + i * 
dx * dy + j * dx); + } + } + destAddr += sendCount; + } + destAddr -= xferFields * sendCount; + + MPI_Isend(destAddr, xferFields * sendCount, baseType, myRank + 1, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg]); + ++pmsg; + } + } + + if(!planeOnly) + { + if(rowMin && colMin) + { + int toRank = myRank - domain.tp() - 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + destAddr[i] = (domain.*src)(i * dx * dy); + } + destAddr += dz; + } + destAddr -= xferFields * dz; + MPI_Isend(destAddr, xferFields * dz, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMin && planeMin) + { + int toRank = myRank - domain.tp() * domain.tp() - domain.tp(); + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + destAddr[i] = (domain.*src)(i); + } + destAddr += dx; + } + destAddr -= xferFields * dx; + MPI_Isend(destAddr, xferFields * dx, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(colMin && planeMin) + { + int toRank = myRank - domain.tp() * domain.tp() - 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + destAddr[i] = (domain.*src)(i * dx); + } + destAddr += dy; + } + destAddr -= xferFields * dy; + MPI_Isend(destAddr, xferFields * dy, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && colMax && doSend) + { + int toRank = myRank + domain.tp() + 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; 
++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + destAddr[i] = (domain.*src)(dx * dy - 1 + i * dx * dy); + } + destAddr += dz; + } + destAddr -= xferFields * dz; + MPI_Isend(destAddr, xferFields * dz, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && planeMax && doSend) + { + int toRank = myRank + domain.tp() * domain.tp() + domain.tp(); + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + destAddr[i] = (domain.*src)(dx * (dy - 1) + dx * dy * (dz - 1) + i); + } + destAddr += dx; + } + destAddr -= xferFields * dx; + MPI_Isend(destAddr, xferFields * dx, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(colMax && planeMax && doSend) + { + int toRank = myRank + domain.tp() * domain.tp() + 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + destAddr[i] = (domain.*src)(dx * dy * (dz - 1) + dx - 1 + i * dx); + } + destAddr += dy; + } + destAddr -= xferFields * dy; + MPI_Isend(destAddr, xferFields * dy, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && colMin && doSend) + { + int toRank = myRank + domain.tp() - 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + destAddr[i] = (domain.*src)(dx * (dy - 1) + i * dx * dy); + } + destAddr += dz; + } + destAddr -= xferFields * dz; + MPI_Isend(destAddr, xferFields * dz, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + 
if(rowMin && planeMax && doSend) + { + int toRank = myRank + domain.tp() * domain.tp() - domain.tp(); + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + destAddr[i] = (domain.*src)(dx * dy * (dz - 1) + i); + } + destAddr += dx; + } + destAddr -= xferFields * dx; + MPI_Isend(destAddr, xferFields * dx, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(colMin && planeMax && doSend) + { + int toRank = myRank + domain.tp() * domain.tp() - 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + destAddr[i] = (domain.*src)(dx * dy * (dz - 1) + i * dx); + } + destAddr += dy; + } + destAddr -= xferFields * dy; + MPI_Isend(destAddr, xferFields * dy, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMin && colMax) + { + int toRank = myRank - domain.tp() + 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + destAddr[i] = (domain.*src)(dx - 1 + i * dx * dy); + } + destAddr += dz; + } + destAddr -= xferFields * dz; + MPI_Isend(destAddr, xferFields * dz, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMax && planeMin) + { + int toRank = myRank - domain.tp() * domain.tp() + domain.tp(); + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + destAddr[i] = (domain.*src)(dx * (dy - 1) + i); + } + destAddr += dx; + } + destAddr -= 
xferFields * dx; + MPI_Isend(destAddr, xferFields * dx, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(colMax && planeMin) + { + int toRank = myRank - domain.tp() * domain.tp() + 1; + destAddr = &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member src = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + destAddr[i] = (domain.*src)(dx - 1 + i * dx); + } + destAddr += dy; + } + destAddr -= xferFields * dy; + MPI_Isend(destAddr, xferFields * dy, baseType, toRank, msgType, + MPI_COMM_WORLD, &domain.sendRequest[pmsg + emsg]); + ++emsg; + } + + if(rowMin && colMin && planeMin) + { + /* corner at domain logical coord (0, 0, 0) */ + int toRank = myRank - domain.tp() * domain.tp() - domain.tp() - 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(0); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMin && colMin && planeMax && doSend) + { + /* corner at domain logical coord (0, 0, 1) */ + int toRank = myRank + domain.tp() * domain.tp() - domain.tp() - 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(idx); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMin && colMax && planeMin) + { + /* corner at domain logical coord (1, 0, 0) */ + int toRank = myRank - domain.tp() * domain.tp() - domain.tp() + 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * 
CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx - 1; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(idx); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMin && colMax && planeMax && doSend) + { + /* corner at domain logical coord (1, 0, 1) */ + int toRank = myRank + domain.tp() * domain.tp() - domain.tp() + 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1) + (dx - 1); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(idx); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMax && colMin && planeMin) + { + /* corner at domain logical coord (0, 1, 0) */ + int toRank = myRank - domain.tp() * domain.tp() + domain.tp() - 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * (dy - 1); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(idx); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMax && colMin && planeMax && doSend) + { + /* corner at domain logical coord (0, 1, 1) */ + int toRank = myRank + domain.tp() * domain.tp() + domain.tp() - 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1) + dx * (dy - 1); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(idx); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMax && colMax && planeMin) + { + 
/* corner at domain logical coord (1, 1, 0) */ + int toRank = myRank - domain.tp() * domain.tp() + domain.tp() + 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy - 1; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(idx); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + if(rowMax && colMax && planeMax && doSend) + { + /* corner at domain logical coord (1, 1, 1) */ + int toRank = myRank + domain.tp() * domain.tp() + domain.tp() + 1; + Real_t* comBuf = + &domain.commDataSend[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * dz - 1; + for(Index_t fi = 0; fi < xferFields; ++fi) + { + comBuf[fi] = (domain.*fieldData[fi])(idx); + } + MPI_Isend(comBuf, xferFields, baseType, toRank, msgType, MPI_COMM_WORLD, + &domain.sendRequest[pmsg + emsg + cmsg]); + ++cmsg; + } + } + + MPI_Waitall(26, domain.sendRequest, status); +} + +/******************************************/ + +void +CommSBN(Domain& domain, int xferFields, Domain_member* fieldData) +{ + if(domain.numRanks() == 1) + return; + + /* summation order should be from smallest value to largest */ + /* or we could try out kahan summation! 
*/ + + int myRank; + Index_t maxPlaneComm = xferFields * domain.maxPlaneSize(); + Index_t maxEdgeComm = xferFields * domain.maxEdgeSize(); + Index_t pmsg = 0; /* plane comm msg */ + Index_t emsg = 0; /* edge comm msg */ + Index_t cmsg = 0; /* corner comm msg */ + Index_t dx = domain.sizeX() + 1; + Index_t dy = domain.sizeY() + 1; + Index_t dz = domain.sizeZ() + 1; + MPI_Status status; + Real_t* srcAddr; + Index_t rowMin, rowMax, colMin, colMax, planeMin, planeMax; + /* assume communication to 6 neighbors by default */ + rowMin = rowMax = colMin = colMax = planeMin = planeMax = 1; + if(domain.rowLoc() == 0) + { + rowMin = 0; + } + if(domain.rowLoc() == (domain.tp() - 1)) + { + rowMax = 0; + } + if(domain.colLoc() == 0) + { + colMin = 0; + } + if(domain.colLoc() == (domain.tp() - 1)) + { + colMax = 0; + } + if(domain.planeLoc() == 0) + { + planeMin = 0; + } + if(domain.planeLoc() == (domain.tp() - 1)) + { + planeMax = 0; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + + if(planeMin | planeMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dx * dy; + + if(planeMin) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(i) += srcAddr[i]; + } + srcAddr += opCount; + } + ++pmsg; + } + if(planeMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(dx * dy * (dz - 1) + i) += srcAddr[i]; + } + srcAddr += opCount; + } + ++pmsg; + } + } + + if(rowMin | rowMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dx * dz; + + if(rowMin) + { + /* 
contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dx; ++j) + { + (domain.*dest)(i * dx * dy + j) += srcAddr[i * dx + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + if(rowMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dx; ++j) + { + (domain.*dest)(dx * (dy - 1) + i * dx * dy + j) += + srcAddr[i * dx + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + } + if(colMin | colMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dy * dz; + + if(colMin) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dy; ++j) + { + (domain.*dest)(i * dx * dy + j * dx) += srcAddr[i * dy + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + if(colMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dy; ++j) + { + (domain.*dest)(dx - 1 + i * dx * dy + j * dx) += + srcAddr[i * dy + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + } + + if(rowMin & colMin) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < 
xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(i * dx * dy) += srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMin & planeMin) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(i) += srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMin & planeMin) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(i * dx) += srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMax & colMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(dx * dy - 1 + i * dx * dy) += srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMax & planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(dx * (dy - 1) + dx * dy * (dz - 1) + i) += srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMax & planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(dx * dy * 
(dz - 1) + dx - 1 + i * dx) += srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMax & colMin) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(dx * (dy - 1) + i * dx * dy) += srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMin & planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(dx * dy * (dz - 1) + i) += srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMin & planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(dx * dy * (dz - 1) + i * dx) += srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMin & colMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(dx - 1 + i * dx * dy) += srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMax & planeMin) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(dx * (dy - 1) + i) += srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMax 
& planeMin) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(dx - 1 + i * dx) += srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMin & colMin & planeMin) + { + /* corner at domain logical coord (0, 0, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(0) += comBuf[fi]; + } + ++cmsg; + } + if(rowMin & colMin & planeMax) + { + /* corner at domain logical coord (0, 0, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) += comBuf[fi]; + } + ++cmsg; + } + if(rowMin & colMax & planeMin) + { + /* corner at domain logical coord (1, 0, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx - 1; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) += comBuf[fi]; + } + ++cmsg; + } + if(rowMin & colMax & planeMax) + { + /* corner at domain logical coord (1, 0, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1) + (dx - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) += comBuf[fi]; + } + ++cmsg; + } + if(rowMax 
& colMin & planeMin) + { + /* corner at domain logical coord (0, 1, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * (dy - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) += comBuf[fi]; + } + ++cmsg; + } + if(rowMax & colMin & planeMax) + { + /* corner at domain logical coord (0, 1, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1) + dx * (dy - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) += comBuf[fi]; + } + ++cmsg; + } + if(rowMax & colMax & planeMin) + { + /* corner at domain logical coord (1, 1, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy - 1; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) += comBuf[fi]; + } + ++cmsg; + } + if(rowMax & colMax & planeMax) + { + /* corner at domain logical coord (1, 1, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * dz - 1; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) += comBuf[fi]; + } + ++cmsg; + } +} + +/******************************************/ + +void +CommSyncPosVel(Domain& domain) +{ + if(domain.numRanks() == 1) + return; + + int myRank; + bool doRecv = false; + Index_t xferFields = 6; /* x, y, z, xd, yd, zd */ + Domain_member fieldData[6]; + Index_t maxPlaneComm = xferFields * domain.maxPlaneSize(); + Index_t maxEdgeComm = 
xferFields * domain.maxEdgeSize(); + Index_t pmsg = 0; /* plane comm msg */ + Index_t emsg = 0; /* edge comm msg */ + Index_t cmsg = 0; /* corner comm msg */ + Index_t dx = domain.sizeX() + 1; + Index_t dy = domain.sizeY() + 1; + Index_t dz = domain.sizeZ() + 1; + MPI_Status status; + Real_t* srcAddr; + bool rowMin, rowMax, colMin, colMax, planeMin, planeMax; + + /* assume communication to 6 neighbors by default */ + rowMin = rowMax = colMin = colMax = planeMin = planeMax = true; + if(domain.rowLoc() == 0) + { + rowMin = false; + } + if(domain.rowLoc() == (domain.tp() - 1)) + { + rowMax = false; + } + if(domain.colLoc() == 0) + { + colMin = false; + } + if(domain.colLoc() == (domain.tp() - 1)) + { + colMax = false; + } + if(domain.planeLoc() == 0) + { + planeMin = false; + } + if(domain.planeLoc() == (domain.tp() - 1)) + { + planeMax = false; + } + + fieldData[0] = &Domain::x; + fieldData[1] = &Domain::y; + fieldData[2] = &Domain::z; + fieldData[3] = &Domain::xd; + fieldData[4] = &Domain::yd; + fieldData[5] = &Domain::zd; + + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + + if(planeMin | planeMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dx * dy; + + if(planeMin && doRecv) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(i) = srcAddr[i]; + } + srcAddr += opCount; + } + ++pmsg; + } + if(planeMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(dx * dy * (dz - 1) + i) = srcAddr[i]; + } + srcAddr += opCount; + } + ++pmsg; + } + } + + if(rowMin | rowMax) + { + /* ASSUMING 
ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dx * dz; + + if(rowMin && doRecv) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dx; ++j) + { + (domain.*dest)(i * dx * dy + j) = srcAddr[i * dx + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + if(rowMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dx; ++j) + { + (domain.*dest)(dx * (dy - 1) + i * dx * dy + j) = + srcAddr[i * dx + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + } + + if(colMin | colMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dy * dz; + + if(colMin && doRecv) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dy; ++j) + { + (domain.*dest)(i * dx * dy + j * dx) = srcAddr[i * dy + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + if(colMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + for(Index_t j = 0; j < dy; ++j) + { + (domain.*dest)(dx - 1 + i * dx * dy + j * dx) = + srcAddr[i * dy + j]; + } + } + srcAddr += opCount; + } + ++pmsg; + } + } + + if(rowMin && colMin && doRecv) + { + srcAddr = 
&domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(i * dx * dy) = srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMin && planeMin && doRecv) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(i) = srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMin && planeMin && doRecv) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(i * dx) = srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMax && colMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(dx * dy - 1 + i * dx * dy) = srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMax && planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(dx * (dy - 1) + dx * dy * (dz - 1) + i) = srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMax && planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + 
emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(dx * dy * (dz - 1) + dx - 1 + i * dx) = srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMax && colMin) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(dx * (dy - 1) + i * dx * dy) = srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMin && planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(dx * dy * (dz - 1) + i) = srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMin && planeMax) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(dx * dy * (dz - 1) + i * dx) = srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMin && colMax && doRecv) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dz; ++i) + { + (domain.*dest)(dx - 1 + i * dx * dy) = srcAddr[i]; + } + srcAddr += dz; + } + ++emsg; + } + + if(rowMax && planeMin && doRecv) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; 
++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dx; ++i) + { + (domain.*dest)(dx * (dy - 1) + i) = srcAddr[i]; + } + srcAddr += dx; + } + ++emsg; + } + + if(colMax && planeMin && doRecv) + { + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm]; + MPI_Wait(&domain.recvRequest[pmsg + emsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < dy; ++i) + { + (domain.*dest)(dx - 1 + i * dx) = srcAddr[i]; + } + srcAddr += dy; + } + ++emsg; + } + + if(rowMin && colMin && planeMin && doRecv) + { + /* corner at domain logical coord (0, 0, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(0) = comBuf[fi]; + } + ++cmsg; + } + if(rowMin && colMin && planeMax) + { + /* corner at domain logical coord (0, 0, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) = comBuf[fi]; + } + ++cmsg; + } + if(rowMin && colMax && planeMin && doRecv) + { + /* corner at domain logical coord (1, 0, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx - 1; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) = comBuf[fi]; + } + ++cmsg; + } + if(rowMin && colMax && planeMax) + { + /* corner at domain logical coord (1, 0, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t 
idx = dx * dy * (dz - 1) + (dx - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) = comBuf[fi]; + } + ++cmsg; + } + if(rowMax && colMin && planeMin && doRecv) + { + /* corner at domain logical coord (0, 1, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * (dy - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) = comBuf[fi]; + } + ++cmsg; + } + if(rowMax && colMin && planeMax) + { + /* corner at domain logical coord (0, 1, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * (dz - 1) + dx * (dy - 1); + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) = comBuf[fi]; + } + ++cmsg; + } + if(rowMax && colMax && planeMin && doRecv) + { + /* corner at domain logical coord (1, 1, 0) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy - 1; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) = comBuf[fi]; + } + ++cmsg; + } + if(rowMax && colMax && planeMax) + { + /* corner at domain logical coord (1, 1, 1) */ + Real_t* comBuf = &domain.commDataRecv[pmsg * maxPlaneComm + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL]; + Index_t idx = dx * dy * dz - 1; + MPI_Wait(&domain.recvRequest[pmsg + emsg + cmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + (domain.*fieldData[fi])(idx) = comBuf[fi]; + } + ++cmsg; + } +} + +/******************************************/ + +void +CommMonoQ(Domain& domain) +{ + 
if(domain.numRanks() == 1) + return; + + int myRank; + Index_t xferFields = 3; /* delv_xi, delv_eta, delv_zeta */ + Domain_member fieldData[3]; + Index_t fieldOffset[3]; + Index_t maxPlaneComm = xferFields * domain.maxPlaneSize(); + Index_t pmsg = 0; /* plane comm msg */ + Index_t dx = domain.sizeX(); + Index_t dy = domain.sizeY(); + Index_t dz = domain.sizeZ(); + MPI_Status status; + Real_t* srcAddr; + bool rowMin, rowMax, colMin, colMax, planeMin, planeMax; + /* assume communication to 6 neighbors by default */ + rowMin = rowMax = colMin = colMax = planeMin = planeMax = true; + if(domain.rowLoc() == 0) + { + rowMin = false; + } + if(domain.rowLoc() == (domain.tp() - 1)) + { + rowMax = false; + } + if(domain.colLoc() == 0) + { + colMin = false; + } + if(domain.colLoc() == (domain.tp() - 1)) + { + colMax = false; + } + if(domain.planeLoc() == 0) + { + planeMin = false; + } + if(domain.planeLoc() == (domain.tp() - 1)) + { + planeMax = false; + } + + /* point into ghost data area */ + // fieldData[0] = &(domain.delv_xi(domain.numElem())) ; + // fieldData[1] = &(domain.delv_eta(domain.numElem())) ; + // fieldData[2] = &(domain.delv_zeta(domain.numElem())) ; + fieldData[0] = &Domain::delv_xi; + fieldData[1] = &Domain::delv_eta; + fieldData[2] = &Domain::delv_zeta; + fieldOffset[0] = domain.numElem(); + fieldOffset[1] = domain.numElem(); + fieldOffset[2] = domain.numElem(); + + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + + if(planeMin | planeMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dx * dy; + + if(planeMin) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i]; + } + srcAddr += opCount; + fieldOffset[fi] += opCount; + } + ++pmsg; + } + if(planeMax) + { + /* 
contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i]; + } + srcAddr += opCount; + fieldOffset[fi] += opCount; + } + ++pmsg; + } + } + + if(rowMin | rowMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dx * dz; + + if(rowMin) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i]; + } + srcAddr += opCount; + fieldOffset[fi] += opCount; + } + ++pmsg; + } + if(rowMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i]; + } + srcAddr += opCount; + fieldOffset[fi] += opCount; + } + ++pmsg; + } + } + if(colMin | colMax) + { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + Index_t opCount = dy * dz; + + if(colMin) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi = 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i]; + } + srcAddr += opCount; + fieldOffset[fi] += opCount; + } + ++pmsg; + } + if(colMax) + { + /* contiguous memory */ + srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm]; + MPI_Wait(&domain.recvRequest[pmsg], &status); + for(Index_t fi 
= 0; fi < xferFields; ++fi) + { + Domain_member dest = fieldData[fi]; + for(Index_t i = 0; i < opCount; ++i) + { + (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i]; + } + srcAddr += opCount; + } + ++pmsg; + } + } +} + +#endif diff --git a/projects/rocprofiler-systems/examples/lulesh/lulesh-init.cc b/projects/rocprofiler-systems/examples/lulesh/lulesh-init.cc new file mode 100644 index 0000000000..d28478afee --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/lulesh-init.cc @@ -0,0 +1,886 @@ +#include +#if USE_MPI +# include +#endif +#if _OPENMP +# include +#endif +#include "lulesh.h" +#include +#include +#include +#include +#include + +static KOKKOS_INLINE_FUNCTION Real_t + CalcElemVolume(const Real_t x0, const Real_t x1, const Real_t x2, const Real_t x3, + const Real_t x4, const Real_t x5, const Real_t x6, const Real_t x7, + const Real_t y0, const Real_t y1, const Real_t y2, const Real_t y3, + const Real_t y4, const Real_t y5, const Real_t y6, const Real_t y7, + const Real_t z0, const Real_t z1, const Real_t z2, const Real_t z3, + const Real_t z4, const Real_t z5, const Real_t z6, const Real_t z7) +{ + Real_t twelveth = Real_t(1.0) / Real_t(12.0); + + Real_t dx61 = x6 - x1; + Real_t dy61 = y6 - y1; + Real_t dz61 = z6 - z1; + + Real_t dx70 = x7 - x0; + Real_t dy70 = y7 - y0; + Real_t dz70 = z7 - z0; + + Real_t dx63 = x6 - x3; + Real_t dy63 = y6 - y3; + Real_t dz63 = z6 - z3; + + Real_t dx20 = x2 - x0; + Real_t dy20 = y2 - y0; + Real_t dz20 = z2 - z0; + + Real_t dx50 = x5 - x0; + Real_t dy50 = y5 - y0; + Real_t dz50 = z5 - z0; + + Real_t dx64 = x6 - x4; + Real_t dy64 = y6 - y4; + Real_t dz64 = z6 - z4; + + Real_t dx31 = x3 - x1; + Real_t dy31 = y3 - y1; + Real_t dz31 = z3 - z1; + + Real_t dx72 = x7 - x2; + Real_t dy72 = y7 - y2; + Real_t dz72 = z7 - z2; + + Real_t dx43 = x4 - x3; + Real_t dy43 = y4 - y3; + Real_t dz43 = z4 - z3; + + Real_t dx57 = x5 - x7; + Real_t dy57 = y5 - y7; + Real_t dz57 = z5 - z7; + + Real_t dx14 = x1 - x4; + Real_t dy14 = y1 - 
y4; + Real_t dz14 = z1 - z4; + + Real_t dx25 = x2 - x5; + Real_t dy25 = y2 - y5; + Real_t dz25 = z2 - z5; + +#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \ + ((x1) * ((y2) * (z3) - (z2) * (y3)) + (x2) * ((z1) * (y3) - (y1) * (z3)) + \ + (x3) * ((y1) * (z2) - (z1) * (y2))) + + Real_t volume = TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20, dy31 + dy72, dy63, dy20, + dz31 + dz72, dz63, dz20) + + TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70, dy43 + dy57, dy64, dy70, + dz43 + dz57, dz64, dz70) + + TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50, dy14 + dy25, dy61, dy50, + dz14 + dz25, dz61, dz50); + +#undef TRIPLE_PRODUCT + + volume *= twelveth; + + return volume; +} + +/******************************************/ + +KOKKOS_INLINE_FUNCTION +Real_t +CalcElemVolume(const Real_t x[8], const Real_t y[8], const Real_t z[8]) +{ + return CalcElemVolume(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], y[0], y[1], + y[2], y[3], y[4], y[5], y[6], y[7], z[0], z[1], z[2], z[3], + z[4], z[5], z[6], z[7]); +} + +///////////////////////////////////////////////////////////////////// +Domain::Domain(Int_t numRanks, Index_t colLoc, Index_t rowLoc, Index_t planeLoc, + Index_t nx, int tp, int nr, int balance, Int_t cost) +: m_e_cut(Real_t(1.0e-7)) +, m_p_cut(Real_t(1.0e-7)) +, m_q_cut(Real_t(1.0e-7)) +, m_v_cut(Real_t(1.0e-10)) +, m_u_cut(Real_t(1.0e-7)) +, m_hgcoef(Real_t(3.0)) +, m_ss4o3(Real_t(4.0) / Real_t(3.0)) +, m_qstop(Real_t(1.0e+12)) +, m_monoq_max_slope(Real_t(1.0)) +, m_monoq_limiter_mult(Real_t(2.0)) +, m_qlc_monoq(Real_t(0.5)) +, m_qqc_monoq(Real_t(2.0) / Real_t(3.0)) +, m_qqc(Real_t(2.0)) +, m_eosvmax(Real_t(1.0e+9)) +, m_eosvmin(Real_t(1.0e-9)) +, m_pmin(Real_t(0.)) +, m_emin(Real_t(-1.0e+15)) +, m_dvovmax(Real_t(0.1)) +, m_refdens(Real_t(1.0)) +, + // + // set pointers to (potentially) "new'd" arrays to null to + // simplify deallocation. 
+ // + m_regNumList(0) +, m_nodeElemStart(0) +, m_nodeElemCornerList(0) +, m_regElemSize(0) +, m_regElemlist(0) +#if USE_MPI +, commDataSend(0) +, commDataRecv(0) +#endif +{ + Index_t edgeElems = nx; + Index_t edgeNodes = edgeElems + 1; + this->cost() = cost; + + m_tp = tp; + m_numRanks = numRanks; + + /////////////////////////////// + // Initialize Sedov Mesh + /////////////////////////////// + + // construct a uniform box for this processor + + m_colLoc = colLoc; + m_rowLoc = rowLoc; + m_planeLoc = planeLoc; + + m_sizeX = edgeElems; + m_sizeY = edgeElems; + m_sizeZ = edgeElems; + m_numElem = edgeElems * edgeElems * edgeElems; + + m_numNode = edgeNodes * edgeNodes * edgeNodes; + + m_regNumList = Allocate(numElem()); // material indexset + + // Elem-centered + AllocateElemPersistent(numElem()); + + // Node-centered + AllocateNodePersistent(numNode()); + + SetupCommBuffers(edgeNodes); + + // Basic Field Initialization + for(Index_t i = 0; i < numElem(); ++i) + { + e(i) = Real_t(0.0); + p(i) = Real_t(0.0); + q(i) = Real_t(0.0); + ss(i) = Real_t(0.0); + } + + // Note - v initializes to 1.0, not 0.0! + for(Index_t i = 0; i < numElem(); ++i) + { + v(i) = Real_t(1.0); + } + + for(Index_t i = 0; i < numNode(); ++i) + { + xd(i) = Real_t(0.0); + yd(i) = Real_t(0.0); + zd(i) = Real_t(0.0); + } + + for(Index_t i = 0; i < numNode(); ++i) + { + xdd(i) = Real_t(0.0); + ydd(i) = Real_t(0.0); + zdd(i) = Real_t(0.0); + } + + for(Index_t i = 0; i < numNode(); ++i) + { + nodalMass(i) = Real_t(0.0); + } + + BuildMesh(nx, edgeNodes, edgeElems); + +#if _OPENMP + SetupThreadSupportStructures(); +#else + // These arrays are not used if we're not threaded + m_nodeElemStart = NULL; + m_nodeElemCornerList = NULL; +#endif + + // Setup region index sets. 
For now, these are constant sized + // throughout the run, but could be changed every cycle to + // simulate effects of ALE on the lagrange solver + CreateRegionIndexSets(nr, balance); + + // Setup symmetry nodesets + SetupSymmetryPlanes(edgeNodes); + + // Setup element connectivities + SetupElementConnectivities(edgeElems); + + // Setup symmetry planes and free surface boundary arrays + SetupBoundaryConditions(edgeElems); + + // Setup defaults + + // These can be changed (requires recompile) if you want to run + // with a fixed timestep, or to a different end time, but it's + // probably easier/better to just run a fixed number of timesteps + // using the -i flag in 2.x + + dtfixed() = Real_t(-1.0e-6); // Negative means use courant condition + stoptime() = Real_t(1.0e-2); // *Real_t(edgeElems*tp/45.0) ; + + // Initial conditions + deltatimemultlb() = Real_t(1.1); + deltatimemultub() = Real_t(1.2); + dtcourant() = Real_t(1.0e+20); + dthydro() = Real_t(1.0e+20); + dtmax() = Real_t(1.0e-2); + time() = Real_t(0.); + cycle() = Int_t(0); + + // initialize field data + for(Index_t i = 0; i < numElem(); ++i) + { + Real_t x_local[8], y_local[8], z_local[8]; + Index_t* elemToNode = nodelist(i); + for(Index_t lnode = 0; lnode < 8; ++lnode) + { + Index_t gnode = elemToNode[lnode]; + x_local[lnode] = x(gnode); + y_local[lnode] = y(gnode); + z_local[lnode] = z(gnode); + } + + // volume calculations + Real_t volume = CalcElemVolume(x_local, y_local, z_local); + volo(i) = volume; + elemMass(i) = volume; + for(Index_t j = 0; j < 8; ++j) + { + Index_t idx = elemToNode[j]; + nodalMass(idx) += volume / Real_t(8.0); + } + } + + // deposit initial energy + // An energy of 3.948746e+7 is correct for a problem with + // 45 zones along a side - we need to scale it + const Real_t ebase = Real_t(3.948746e+7); + Real_t scale = (nx * m_tp) / Real_t(45.0); + Real_t einit = ebase * scale * scale * scale; + if(m_rowLoc + m_colLoc + m_planeLoc == 0) + { + // Dump into the first zone (which we 
know is in the corner) + // of the domain that sits at the origin + e(0) = einit; + } + // set initial deltatime base on analytic CFL calculation + deltatime() = (Real_t(.5) * cbrt(volo(0))) / sqrt(Real_t(2.0) * einit); + +} // End constructor + +//////////////////////////////////////////////////////////////////////////////// +Domain::~Domain() +{ + /* Release(&m_regNumList); + Release(&m_nodeElemStart); + Release(&m_nodeElemCornerList); + Release(&m_regElemSize); + for (Index_t i=0 ; i(numNode()); + + for(Index_t i = 0; i < numNode(); ++i) + { + nodeElemCount[i] = 0; + } + + for(Index_t i = 0; i < numElem(); ++i) + { + Index_t* nl = nodelist(i); + for(Index_t j = 0; j < 8; ++j) + { + ++(nodeElemCount[nl[j]]); + } + } + + m_nodeElemStart = Allocate(numNode() + 1); + + m_nodeElemStart[0] = 0; + + for(Index_t i = 1; i <= numNode(); ++i) + { + m_nodeElemStart[i] = m_nodeElemStart[i - 1] + nodeElemCount[i - 1]; + } + + m_nodeElemCornerList = Allocate(m_nodeElemStart[numNode()]); + + for(Index_t i = 0; i < numNode(); ++i) + { + nodeElemCount[i] = 0; + } + + for(Index_t i = 0; i < numElem(); ++i) + { + Index_t* nl = nodelist(i); + for(Index_t j = 0; j < 8; ++j) + { + Index_t m = nl[j]; + Index_t k = i * 8 + j; + Index_t offset = m_nodeElemStart[m] + nodeElemCount[m]; + m_nodeElemCornerList[offset] = k; + ++(nodeElemCount[m]); + } + } + + Index_t clSize = m_nodeElemStart[numNode()]; + for(Index_t i = 0; i < clSize; ++i) + { + Index_t clv = m_nodeElemCornerList[i]; + if((clv < 0) || (clv > numElem() * 8)) + { + fprintf( + stderr, + "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n"); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1); +#else + exit(-1); +#endif + } + } + + Release(&nodeElemCount); +} + +//////////////////////////////////////////////////////////////////////////////// +void +Domain::SetupCommBuffers(Int_t edgeNodes) +{ + // allocate a buffer large enough for nodal ghost data + Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), 
this->sizeZ())) + 1; + m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize * maxEdgeSize); + m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize); + + // assume communication to 6 neighbors by default + m_rowMin = (m_rowLoc == 0) ? 0 : 1; + m_rowMax = (m_rowLoc == m_tp - 1) ? 0 : 1; + m_colMin = (m_colLoc == 0) ? 0 : 1; + m_colMax = (m_colLoc == m_tp - 1) ? 0 : 1; + m_planeMin = (m_planeLoc == 0) ? 0 : 1; + m_planeMax = (m_planeLoc == m_tp - 1) ? 0 : 1; + +#if USE_MPI + // account for face communication + Index_t comBufSize = + (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) * + m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM; + + // account for edge communication + comBufSize += + ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) + + (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) + + (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) + + (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) * + m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM; + + // account for corner communication + // factor of 16 is so each buffer has its own cache line + comBufSize += + ((m_rowMin & m_colMin & m_planeMin) + (m_rowMin & m_colMin & m_planeMax) + + (m_rowMin & m_colMax & m_planeMin) + (m_rowMin & m_colMax & m_planeMax) + + (m_rowMax & m_colMin & m_planeMin) + (m_rowMax & m_colMin & m_planeMax) + + (m_rowMax & m_colMax & m_planeMin) + (m_rowMax & m_colMax & m_planeMax)) * + CACHE_COHERENCE_PAD_REAL; + + this->commDataSend = Allocate(comBufSize); + this->commDataRecv = Allocate(comBufSize); + // prevent floating point exceptions + memset(this->commDataSend, 0, comBufSize * sizeof(Real_t)); + memset(this->commDataRecv, 0, comBufSize * sizeof(Real_t)); +#endif + + // Boundary nodesets + if(m_colLoc == 0) + m_symmX.resize(edgeNodes * edgeNodes); + if(m_rowLoc == 0) + m_symmY.resize(edgeNodes * edgeNodes); + if(m_planeLoc == 0) + m_symmZ.resize(edgeNodes * edgeNodes); +} + 
+//////////////////////////////////////////////////////////////////////////////// +void +Domain::CreateRegionIndexSets(Int_t nr, Int_t balance) +{ +#if USE_MPI + Index_t myRank; + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + srand(myRank); +#else + srand(0); + Index_t myRank = 0; +#endif + this->numReg() = nr; + m_regElemSize = Allocate(numReg()); + m_regElemlist = Allocate(numReg()); + Index_t nextIndex = 0; + // if we only have one region just fill it + // Fill out the regNumList with material numbers, which are always + // the region index plus one + if(numReg() == 1) + { + while(nextIndex < numElem()) + { + this->regNumList(nextIndex) = 1; + nextIndex++; + } + regElemSize(0) = 0; + } + // If we have more than one region distribute the elements. + else + { + Int_t regionNum; + Int_t regionVar; + Int_t lastReg = -1; + Int_t binSize; + Index_t elements; + Index_t runto = 0; + Int_t costDenominator = 0; + Int_t* regBinEnd = Allocate(numReg()); + // Determine the relative weights of all the regions. This is based off the -b + // flag. Balance is the value passed into b. + for(Index_t i = 0; i < numReg(); ++i) + { + regElemSize(i) = 0; + costDenominator += pow((i + 1), balance); // Total sum of all regions weights + regBinEnd[i] = + costDenominator; // Chance of hitting a given region is (regBinEnd[i] - + // regBinEdn[i-1])/costDenominator + } + // Until all elements are assigned + while(nextIndex < numElem()) + { + // pick the region + regionVar = rand() % costDenominator; + Index_t i = 0; + while(regionVar >= regBinEnd[i]) + i++; + // rotate the regions based on MPI rank. 
Rotation is Rank % NumRegions this + // makes each domain have a different region with the highest representation + regionNum = ((i + myRank) % numReg()) + 1; + // make sure we don't pick the same region twice in a row + while(regionNum == lastReg) + { + regionVar = rand() % costDenominator; + i = 0; + while(regionVar >= regBinEnd[i]) + i++; + regionNum = ((i + myRank) % numReg()) + 1; + } + // Pick the bin size of the region and determine the number of elements. + binSize = rand() % 1000; + if(binSize < 773) + { + elements = rand() % 15 + 1; + } + else if(binSize < 937) + { + elements = rand() % 16 + 16; + } + else if(binSize < 970) + { + elements = rand() % 32 + 32; + } + else if(binSize < 974) + { + elements = rand() % 64 + 64; + } + else if(binSize < 978) + { + elements = rand() % 128 + 128; + } + else if(binSize < 981) + { + elements = rand() % 256 + 256; + } + else + elements = rand() % 1537 + 512; + runto = elements + nextIndex; + // Store the elements. If we hit the end before we run out of elements then + // just stop. 
+ while(nextIndex < runto && nextIndex < numElem()) + { + this->regNumList(nextIndex) = regionNum; + nextIndex++; + } + lastReg = regionNum; + } + } + // Convert regNumList to region index sets + // First, count size of each region + for(Index_t i = 0; i < numElem(); ++i) + { + int r = this->regNumList(i) - 1; // region index == regnum-1 + regElemSize(r)++; + } + // Second, allocate each region index set + for(Index_t i = 0; i < numReg(); ++i) + { + m_regElemlist[i] = Allocate(regElemSize(i)); + regElemSize(i) = 0; + } + // Third, fill index sets + for(Index_t i = 0; i < numElem(); ++i) + { + Index_t r = regNumList(i) - 1; // region index == regnum-1 + Index_t regndx = regElemSize(r)++; // Note increment + regElemlist(r, regndx) = i; + } +} + +///////////////////////////////////////////////////////////// +void +Domain::SetupSymmetryPlanes(Int_t edgeNodes) +{ + Index_t nidx = 0; + for(Index_t i = 0; i < edgeNodes; ++i) + { + Index_t planeInc = i * edgeNodes * edgeNodes; + Index_t rowInc = i * edgeNodes; + for(Index_t j = 0; j < edgeNodes; ++j) + { + if(m_planeLoc == 0) + { + m_symmZ[nidx] = rowInc + j; + } + if(m_rowLoc == 0) + { + m_symmY[nidx] = planeInc + j; + } + if(m_colLoc == 0) + { + m_symmX[nidx] = planeInc + j * edgeNodes; + } + ++nidx; + } + } +} + +///////////////////////////////////////////////////////////// +void +Domain::SetupElementConnectivities(Int_t edgeElems) +{ + lxim(0) = 0; + for(Index_t i = 1; i < numElem(); ++i) + { + lxim(i) = i - 1; + lxip(i - 1) = i; + } + lxip(numElem() - 1) = numElem() - 1; + + for(Index_t i = 0; i < edgeElems; ++i) + { + letam(i) = i; + letap(numElem() - edgeElems + i) = numElem() - edgeElems + i; + } + for(Index_t i = edgeElems; i < numElem(); ++i) + { + letam(i) = i - edgeElems; + letap(i - edgeElems) = i; + } + + for(Index_t i = 0; i < edgeElems * edgeElems; ++i) + { + lzetam(i) = i; + lzetap(numElem() - edgeElems * edgeElems + i) = + numElem() - edgeElems * edgeElems + i; + } + for(Index_t i = edgeElems * edgeElems; 
i < numElem(); ++i) + { + lzetam(i) = i - edgeElems * edgeElems; + lzetap(i - edgeElems * edgeElems) = i; + } +} + +///////////////////////////////////////////////////////////// +void +Domain::SetupBoundaryConditions(Int_t edgeElems) +{ + Index_t ghostIdx[6]; // offsets to ghost locations + + // set up boundary condition information + for(Index_t i = 0; i < numElem(); ++i) + { + elemBC(i) = Int_t(0); + } + + for(Index_t i = 0; i < 6; ++i) + { + ghostIdx[i] = INT_MIN; + } + + Int_t pidx = numElem(); + if(m_planeMin != 0) + { + ghostIdx[0] = pidx; + pidx += sizeX() * sizeY(); + } + + if(m_planeMax != 0) + { + ghostIdx[1] = pidx; + pidx += sizeX() * sizeY(); + } + + if(m_rowMin != 0) + { + ghostIdx[2] = pidx; + pidx += sizeX() * sizeZ(); + } + + if(m_rowMax != 0) + { + ghostIdx[3] = pidx; + pidx += sizeX() * sizeZ(); + } + + if(m_colMin != 0) + { + ghostIdx[4] = pidx; + pidx += sizeY() * sizeZ(); + } + + if(m_colMax != 0) + { + ghostIdx[5] = pidx; + } + + // symmetry plane or free surface BCs + for(Index_t i = 0; i < edgeElems; ++i) + { + Index_t planeInc = i * edgeElems * edgeElems; + Index_t rowInc = i * edgeElems; + for(Index_t j = 0; j < edgeElems; ++j) + { + if(m_planeLoc == 0) + { + elemBC(rowInc + j) |= ZETA_M_SYMM; + } + else + { + elemBC(rowInc + j) |= ZETA_M_COMM; + lzetam(rowInc + j) = ghostIdx[0] + rowInc + j; + } + + if(m_planeLoc == m_tp - 1) + { + elemBC(rowInc + j + numElem() - edgeElems * edgeElems) |= ZETA_P_FREE; + } + else + { + elemBC(rowInc + j + numElem() - edgeElems * edgeElems) |= ZETA_P_COMM; + lzetap(rowInc + j + numElem() - edgeElems * edgeElems) = + ghostIdx[1] + rowInc + j; + } + + if(m_rowLoc == 0) + { + elemBC(planeInc + j) |= ETA_M_SYMM; + } + else + { + elemBC(planeInc + j) |= ETA_M_COMM; + letam(planeInc + j) = ghostIdx[2] + rowInc + j; + } + + if(m_rowLoc == m_tp - 1) + { + elemBC(planeInc + j + edgeElems * edgeElems - edgeElems) |= ETA_P_FREE; + } + else + { + elemBC(planeInc + j + edgeElems * edgeElems - edgeElems) |= ETA_P_COMM; 
+ letap(planeInc + j + edgeElems * edgeElems - edgeElems) = + ghostIdx[3] + rowInc + j; + } + + if(m_colLoc == 0) + { + elemBC(planeInc + j * edgeElems) |= XI_M_SYMM; + } + else + { + elemBC(planeInc + j * edgeElems) |= XI_M_COMM; + lxim(planeInc + j * edgeElems) = ghostIdx[4] + rowInc + j; + } + + if(m_colLoc == m_tp - 1) + { + elemBC(planeInc + j * edgeElems + edgeElems - 1) |= XI_P_FREE; + } + else + { + elemBC(planeInc + j * edgeElems + edgeElems - 1) |= XI_P_COMM; + lxip(planeInc + j * edgeElems + edgeElems - 1) = ghostIdx[5] + rowInc + j; + } + } + } +} + +/////////////////////////////////////////////////////////////////////////// +void +InitMeshDecomp(Int_t numRanks, Int_t myRank, Int_t* col, Int_t* row, Int_t* plane, + Int_t* side) +{ + Int_t testProcs; + Int_t dx, dy, dz; + Int_t myDom; + + // Assume cube processor layout for now + testProcs = Int_t(cbrt(Real_t(numRanks)) + 0.5); + if(testProcs * testProcs * testProcs != numRanks) + { + printf("Num processors must be a cube of an integer (1, 8, 27, ...)\n"); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1); +#else + exit(-1); +#endif + } + if(sizeof(Real_t) != 4 && sizeof(Real_t) != 8) + { + printf("MPI operations only support float and double right now...\n"); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1); +#else + exit(-1); +#endif + } + if(MAX_FIELDS_PER_MPI_COMM > CACHE_COHERENCE_PAD_REAL) + { + printf("corner element comm buffers too small. 
/* Helper function for converting strings to ints, with error checking.
 *
 *   token  - text to parse as a base-10 integer; may be NULL
 *   retVal - receives the parsed value
 *
 * Returns 1 when at least one digit was consumed and the number is followed by
 * a space or the end of the string, 0 otherwise (including NULL arguments).
 *
 * A number that does not fit in an int is rejected and *retVal is left
 * untouched; previously it was silently truncated and reported as a success.
 * For any other failure *retVal still receives whatever prefix strtol parsed,
 * as before.
 */
int
StrToInt(const char* token, int* retVal)
{
    const int decimal_base = 10;

    if(token == nullptr || retVal == nullptr)
        return 0;

    char*      endptr   = nullptr;
    const long parsed   = strtol(token, &endptr, decimal_base);
    const int  narrowed = static_cast<int>(parsed);

    /* the value must survive the long -> int narrowing unchanged */
    if(static_cast<long>(narrowed) != parsed)
        return 0;

    *retVal = narrowed;

    /* no digits consumed at all */
    if(endptr == token)
        return 0;

    return (*endptr == ' ' || *endptr == '\0') ? 1 : 0;
}
1)\n"); + printf(" -f : Number of files to split viz dump into (def: " + "(np+10)/9)\n"); + printf(" -p : Print out progress\n"); + printf( + " -v : Output viz file (requires compiling with -DVIZ_MESH\n"); + printf(" -h : This message\n"); + printf("\n\n"); + } +} + +static void +ParseError(const char* message, int myRank) +{ + if(myRank == 0) + { + printf("%s\n", message); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1); +#else + exit(-1); +#endif + } +} + +void +ParseCommandLineOptions(int argc, char* argv[], int myRank, struct cmdLineOpts* opts) +{ + if(argc > 1) + { + int i = 1; + + while(i < argc) + { + int ok; + /* -i */ + if(strcmp(argv[i], "-i") == 0) + { + if(i + 1 >= argc) + { + ParseError("Missing integer argument to -i", myRank); + } + ok = StrToInt(argv[i + 1], &(opts->its)); + if(!ok) + { + ParseError("Parse Error on option -i integer value required after " + "argument\n", + myRank); + } + i += 2; + } + /* -s */ + else if(strcmp(argv[i], "-s") == 0) + { + if(i + 1 >= argc) + { + ParseError("Missing integer argument to -s\n", myRank); + } + ok = StrToInt(argv[i + 1], &(opts->nx)); + if(!ok) + { + ParseError("Parse Error on option -s integer value required after " + "argument\n", + myRank); + } + i += 2; + } + /* -r */ + else if(strcmp(argv[i], "-r") == 0) + { + if(i + 1 >= argc) + { + ParseError("Missing integer argument to -r\n", myRank); + } + ok = StrToInt(argv[i + 1], &(opts->numReg)); + if(!ok) + { + ParseError("Parse Error on option -r integer value required after " + "argument\n", + myRank); + } + i += 2; + } + /* -f */ + else if(strcmp(argv[i], "-f") == 0) + { + if(i + 1 >= argc) + { + ParseError("Missing integer argument to -f\n", myRank); + } + ok = StrToInt(argv[i + 1], &(opts->numFiles)); + if(!ok) + { + ParseError("Parse Error on option -f integer value required after " + "argument\n", + myRank); + } + i += 2; + } + /* -p */ + else if(strcmp(argv[i], "-p") == 0) + { + opts->showProg = 1; + i++; + } + /* -q */ + else if(strcmp(argv[i], "-q") 
== 0) + { + opts->quiet = 1; + i++; + } + /* -q */ + else if(strcmp(argv[i], "-a") == 0) + { + opts->do_atomic = 1; + i++; + } + else if(strcmp(argv[i], "-b") == 0) + { + if(i + 1 >= argc) + { + ParseError("Missing integer argument to -b\n", myRank); + } + ok = StrToInt(argv[i + 1], &(opts->balance)); + if(!ok) + { + ParseError("Parse Error on option -b integer value required after " + "argument\n", + myRank); + } + i += 2; + } + else if(strcmp(argv[i], "-c") == 0) + { + if(i + 1 >= argc) + { + ParseError("Missing integer argument to -c\n", myRank); + } + ok = StrToInt(argv[i + 1], &(opts->cost)); + if(!ok) + { + ParseError("Parse Error on option -c integer value required after " + "argument\n", + myRank); + } + i += 2; + } + /* -v */ + else if(strcmp(argv[i], "-v") == 0) + { +#if VIZ_MESH + opts->viz = 1; +#else + ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank); +#endif + i++; + } + /* -h */ + else if(strcmp(argv[i], "-h") == 0) + { + PrintCommandLineOptions(argv[0], myRank); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, 0); +#else + exit(0); +#endif + } + else + { + char msg[80]; + PrintCommandLineOptions(argv[0], myRank); + sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]); + ParseError(msg, myRank); + } + } + } +} + +///////////////////////////////////////////////////////////////////// + +void +VerifyAndWriteFinalOutput(Real_t elapsed_time, Domain& locDom, Int_t nx, Int_t numRanks) +{ + // GrindTime1 only takes a single domain into account, and is thus a good way to + // measure processor speed indepdendent of MPI parallelism. 
GrindTime2 takes into + // account speedups from MPI parallelism + Real_t grindTime1 = ((elapsed_time * 1e6) / locDom.cycle()) / (nx * nx * nx); + Real_t grindTime2 = + ((elapsed_time * 1e6) / locDom.cycle()) / (nx * nx * nx * numRanks); + + Index_t ElemId = 0; + printf("Run completed: \n"); + printf(" Problem size = %i \n", nx); + printf(" MPI tasks = %i \n", numRanks); + printf(" Iteration count = %i \n", locDom.cycle()); + printf(" Final Origin Energy = %12.6e \n", locDom.e(ElemId)); + + Real_t MaxAbsDiff = Real_t(0.0); + Real_t TotalAbsDiff = Real_t(0.0); + Real_t MaxRelDiff = Real_t(0.0); + + for(Index_t j = 0; j < nx; ++j) + { + for(Index_t k = j + 1; k < nx; ++k) + { + Real_t AbsDiff = FABS(locDom.e(j * nx + k) - locDom.e(k * nx + j)); + TotalAbsDiff += AbsDiff; + + if(MaxAbsDiff < AbsDiff) + MaxAbsDiff = AbsDiff; + + Real_t RelDiff = AbsDiff / locDom.e(k * nx + j); + + if(MaxRelDiff < RelDiff) + MaxRelDiff = RelDiff; + } + } + + // Quick symmetry check + printf(" Testing Plane 0 of Energy Array on rank 0:\n"); + printf(" MaxAbsDiff = %12.6e\n", MaxAbsDiff); + printf(" TotalAbsDiff = %12.6e\n", TotalAbsDiff); + printf(" MaxRelDiff = %12.6e\n\n", MaxRelDiff); + + // Timing information + printf("\nElapsed time = %10.2f (s)\n", elapsed_time); + printf("Grind time (us/z/c) = %10.8g (per dom) (%10.8g overall)\n", grindTime1, + grindTime2); + printf("FOM = %10.8g (z/s)\n\n", + 1000.0 / grindTime2); // zones per second + + return; +} diff --git a/projects/rocprofiler-systems/examples/lulesh/lulesh-viz.cc b/projects/rocprofiler-systems/examples/lulesh/lulesh-viz.cc new file mode 100644 index 0000000000..c0499f926f --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/lulesh-viz.cc @@ -0,0 +1,422 @@ +#include "lulesh.h" +#include +#include +#include +#include + +#ifdef VIZ_MESH + +# ifdef __cplusplus +extern "C" +{ +# endif +# include "silo.h" +# if USE_MPI +# include "pmpio.h" +# endif +# ifdef __cplusplus +} +# endif + +// Function prototypes +static 
void +DumpDomainToVisit(DBfile* db, Domain& domain, int myRank); +static + +# if USE_MPI + // For some reason, earlier versions of g++ (e.g. 4.2) won't let me + // put the 'static' qualifier on this prototype, even if it's done + // consistently in the prototype and definition + void + DumpMultiblockObjects(DBfile* db, PMPIO_baton_t* bat, char basename[], int numRanks); + +// Callback prototypes for PMPIO interface (only useful if we're +// running parallel) +static void* +LULESH_PMPIO_Create(const char* fname, const char* dname, void* udata); +static void* +LULESH_PMPIO_Open(const char* fname, const char* dname, PMPIO_iomode_t ioMode, + void* udata); +static void +LULESH_PMPIO_Close(void* file, void* udata); + +# else + void + DumpMultiblockObjects(DBfile* db, char basename[], int numRanks); +# endif + +/**********************************************************************/ +void +DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) +{ + char subdirName[32]; + char basename[32]; + DBfile* db; + + sprintf(basename, "lulesh_plot_c%d", domain.cycle()); + sprintf(subdirName, "data_%d", myRank); + +# if USE_MPI + + PMPIO_baton_t* bat = + PMPIO_Init(numFiles, PMPIO_WRITE, MPI_COMM_WORLD, 10101, LULESH_PMPIO_Create, + LULESH_PMPIO_Open, LULESH_PMPIO_Close, NULL); + + int myiorank = PMPIO_GroupRank(bat, myRank); + + char fileName[64]; + + if(myiorank == 0) + strcpy(fileName, basename); + else + sprintf(fileName, "%s.%03d", basename, myiorank); + + db = (DBfile*) PMPIO_WaitForBaton(bat, fileName, subdirName); + + DumpDomainToVisit(db, domain, myRank); + + // Processor 0 writes out bit of extra data to its file that + // describes how to stitch all the pieces together + if(myRank == 0) + { + DumpMultiblockObjects(db, bat, basename, numRanks); + } + + PMPIO_HandOffBaton(bat, db); + + PMPIO_Finish(bat); +# else + + db = (DBfile*) DBCreate(basename, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X); + + if(db) + { + DBMkDir(db, subdirName); + DBSetDir(db, subdirName); + 
DumpDomainToVisit(db, domain, myRank); + DumpMultiblockObjects(db, basename, numRanks); + } + else + { + printf("Error writing out viz file - rank %d\n", myRank); + } + +# endif +} + +/**********************************************************************/ + +static void +DumpDomainToVisit(DBfile* db, Domain& domain, int myRank) +{ + int ok = 0; + + /* Create an option list that will give some hints to VisIt for + * printing out the cycle and time in the annotations */ + DBoptlist* optlist; + + /* Write out the mesh connectivity in fully unstructured format */ + int shapetype[1] = { DB_ZONETYPE_HEX }; + int shapesize[1] = { 8 }; + int shapecnt[1] = { domain.numElem() }; + int* conn = Allocate(domain.numElem() * 8); + int ci = 0; + for(int ei = 0; ei < domain.numElem(); ++ei) + { + Index_t* elemToNode = domain.nodelist(ei); + for(int ni = 0; ni < 8; ++ni) + { + conn[ci++] = elemToNode[ni]; + } + } + ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3, conn, + domain.numElem() * 8, 0, 0, 0, /* Not carrying ghost zones */ + shapetype, shapesize, shapecnt, 1, NULL); + Release(&conn); + + /* Write out the mesh coordinates associated with the mesh */ + const char* coordnames[3] = { "X", "Y", "Z" }; + float* coords[3]; + coords[0] = Allocate(domain.numNode()); + coords[1] = Allocate(domain.numNode()); + coords[2] = Allocate(domain.numNode()); + for(int ni = 0; ni < domain.numNode(); ++ni) + { + coords[0][ni] = float(domain.x(ni)); + coords[1][ni] = float(domain.y(ni)); + coords[2][ni] = float(domain.z(ni)); + } + optlist = DBMakeOptlist(2); + ok += DBAddOption(optlist, DBOPT_DTIME, &domain.time()); + ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle()); + ok += DBPutUcdmesh(db, "mesh", 3, (char**) &coordnames[0], (float**) coords, + domain.numNode(), domain.numElem(), "connectivity", 0, DB_FLOAT, + optlist); + ok += DBFreeOptlist(optlist); + Release(&coords[2]); + Release(&coords[1]); + Release(&coords[0]); + + /* Write out the materials */ + int* matnums = 
Allocate(domain.numReg()); + int dims[1] = { domain.numElem() }; // No mixed elements + for(int i = 0; i < domain.numReg(); ++i) + matnums[i] = i + 1; + + ok += DBPutMaterial(db, "regions", "mesh", domain.numReg(), matnums, + domain.regNumList(), dims, 1, NULL, NULL, NULL, NULL, 0, DB_FLOAT, + NULL); + Release(&matnums); + + /* Write out pressure, energy, relvol, q */ + + float* e = Allocate(domain.numElem()); + for(int ei = 0; ei < domain.numElem(); ++ei) + { + e[ei] = float(domain.e(ei)); + } + ok += DBPutUcdvar1(db, "e", "mesh", e, domain.numElem(), NULL, 0, DB_FLOAT, + DB_ZONECENT, NULL); + Release(&e); + + float* p = Allocate(domain.numElem()); + for(int ei = 0; ei < domain.numElem(); ++ei) + { + p[ei] = float(domain.p(ei)); + } + ok += DBPutUcdvar1(db, "p", "mesh", p, domain.numElem(), NULL, 0, DB_FLOAT, + DB_ZONECENT, NULL); + Release(&p); + + float* v = Allocate(domain.numElem()); + for(int ei = 0; ei < domain.numElem(); ++ei) + { + v[ei] = float(domain.v(ei)); + } + ok += DBPutUcdvar1(db, "v", "mesh", v, domain.numElem(), NULL, 0, DB_FLOAT, + DB_ZONECENT, NULL); + Release(&v); + + float* q = Allocate(domain.numElem()); + for(int ei = 0; ei < domain.numElem(); ++ei) + { + q[ei] = float(domain.q(ei)); + } + ok += DBPutUcdvar1(db, "q", "mesh", q, domain.numElem(), NULL, 0, DB_FLOAT, + DB_ZONECENT, NULL); + Release(&q); + + /* Write out nodal speed, velocities */ + float* zd = Allocate(domain.numNode()); + float* yd = Allocate(domain.numNode()); + float* xd = Allocate(domain.numNode()); + float* speed = Allocate(domain.numNode()); + for(int ni = 0; ni < domain.numNode(); ++ni) + { + xd[ni] = float(domain.xd(ni)); + yd[ni] = float(domain.yd(ni)); + zd[ni] = float(domain.zd(ni)); + speed[ni] = + float(sqrt((xd[ni] * xd[ni]) + (yd[ni] * yd[ni]) + (zd[ni] * zd[ni]))); + } + + ok += DBPutUcdvar1(db, "speed", "mesh", speed, domain.numNode(), NULL, 0, DB_FLOAT, + DB_NODECENT, NULL); + Release(&speed); + + ok += DBPutUcdvar1(db, "xd", "mesh", xd, domain.numNode(), 
NULL, 0, DB_FLOAT, + DB_NODECENT, NULL); + Release(&xd); + + ok += DBPutUcdvar1(db, "yd", "mesh", yd, domain.numNode(), NULL, 0, DB_FLOAT, + DB_NODECENT, NULL); + Release(&yd); + + ok += DBPutUcdvar1(db, "zd", "mesh", zd, domain.numNode(), NULL, 0, DB_FLOAT, + DB_NODECENT, NULL); + Release(&zd); + + if(ok != 0) + { + printf("Error writing out viz file - rank %d\n", myRank); + } +} + +/**********************************************************************/ + +# if USE_MPI +void +DumpMultiblockObjects(DBfile* db, PMPIO_baton_t* bat, char basename[], int numRanks) +# else +void +DumpMultiblockObjects(DBfile* db, char basename[], int numRanks) +# endif +{ + /* MULTIBLOCK objects to tie together multiple files */ + char** multimeshObjs; + char** multimatObjs; + char*** multivarObjs; + int* blockTypes; + int* varTypes; + int ok = 0; + // Make sure this list matches what's written out above + char vars[][10] = { "p", "e", "v", "q", "speed", "xd", "yd", "zd" }; + int numvars = sizeof(vars) / sizeof(vars[0]); + + // Reset to the root directory of the silo file + DBSetDir(db, "/"); + + // Allocate a bunch of space for building up the string names + multimeshObjs = Allocate(numRanks); + multimatObjs = Allocate(numRanks); + multivarObjs = Allocate(numvars); + blockTypes = Allocate(numRanks); + varTypes = Allocate(numRanks); + + for(int v = 0; v < numvars; ++v) + { + multivarObjs[v] = Allocate(numRanks); + } + + for(int i = 0; i < numRanks; ++i) + { + multimeshObjs[i] = Allocate(64); + multimatObjs[i] = Allocate(64); + for(int v = 0; v < numvars; ++v) + { + multivarObjs[v][i] = Allocate(64); + } + blockTypes[i] = DB_UCDMESH; + varTypes[i] = DB_UCDVAR; + } + + // Build up the multiobject names + for(int i = 0; i < numRanks; ++i) + { +# if USE_MPI + int iorank = PMPIO_GroupRank(bat, i); +# else + int iorank = 0; +# endif + + // delete multivarObjs[i]; + if(iorank == 0) + { + snprintf(multimeshObjs[i], 64, "/data_%d/mesh", i); + snprintf(multimatObjs[i], 64, "/data_%d/regions", 
i); + for(int v = 0; v < numvars; ++v) + { + snprintf(multivarObjs[v][i], 64, "/data_%d/%s", i, vars[v]); + } + } + else + { + snprintf(multimeshObjs[i], 64, "%s.%03d:/data_%d/mesh", basename, iorank, i); + snprintf(multimatObjs[i], 64, "%s.%03d:/data_%d/regions", basename, iorank, + i); + for(int v = 0; v < numvars; ++v) + { + snprintf(multivarObjs[v][i], 64, "%s.%03d:/data_%d/%s", basename, iorank, + i, vars[v]); + } + } + } + + // Now write out the objects + ok += DBPutMultimesh(db, "mesh", numRanks, (char**) multimeshObjs, blockTypes, NULL); + ok += DBPutMultimat(db, "regions", numRanks, (char**) multimatObjs, NULL); + for(int v = 0; v < numvars; ++v) + { + ok += DBPutMultivar(db, vars[v], numRanks, (char**) multivarObjs[v], varTypes, + NULL); + } + + for(int v = 0; v < numvars; ++v) + { + for(int i = 0; i < numRanks; i++) + { + Release(&multivarObjs[v][i]); + } + Release(&multivarObjs[v]); + } + + // Clean up + for(int i = 0; i < numRanks; i++) + { + Release(&multimeshObjs[i]); + Release(&multimatObjs[i]); + } + Release(&multimeshObjs); + Release(&multimatObjs); + Release(&multivarObjs); + Release(&blockTypes); + Release(&varTypes); + + if(ok != 0) + { + printf("Error writing out multiXXX objs to viz file - rank 0\n"); + } +} + +# if USE_MPI + +/**********************************************************************/ + +static void* +LULESH_PMPIO_Create(const char* fname, const char* dname, void* udata) +{ + /* Create the file */ + DBfile* db = DBCreate(fname, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X); + + /* Put the data in a subdirectory, so VisIt only sees the multimesh + * objects we write out in the base file */ + if(db) + { + DBMkDir(db, dname); + DBSetDir(db, dname); + } + return (void*) db; +} + +/**********************************************************************/ + +static void* +LULESH_PMPIO_Open(const char* fname, const char* dname, PMPIO_iomode_t ioMode, + void* udata) +{ + /* Open the file */ + DBfile* db = DBOpen(fname, DB_UNKNOWN, DB_APPEND); + + 
/* Put the data in a subdirectory, so VisIt only sees the multimesh + * objects we write out in the base file */ + if(db) + { + DBMkDir(db, dname); + DBSetDir(db, dname); + } + return (void*) db; +} + +/**********************************************************************/ + +static void +LULESH_PMPIO_Close(void* file, void* udata) +{ + DBfile* db = (DBfile*) file; + if(db) + DBClose(db); +} +# endif + +#else + +void +DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) +{ + if(myRank == 0) + { + printf("Must enable -DVIZ_MESH at compile time to call DumpDomain\n"); + } +} + +#endif diff --git a/projects/rocprofiler-systems/examples/lulesh/lulesh.cc b/projects/rocprofiler-systems/examples/lulesh/lulesh.cc new file mode 100644 index 0000000000..ae35ef70a5 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/lulesh.cc @@ -0,0 +1,2311 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lulesh.h" + +static Real_t* buffer; +static size_t buffer_size; +static size_t buffer_offset; +static int do_atomic; + +void +ResizeBuffer(const size_t size) +{ + buffer_offset = 0; + if(size / sizeof(Real_t) + 1 > buffer_size) + { + buffer_size = size / sizeof(Real_t) + 1; + Release(&buffer); + buffer = Allocate(buffer_size); + } +} + +template +Type* +AllocateFromBuffer(const Index_t& count) +{ + const Index_t offset = (count * sizeof(Type) + sizeof(Real_t) - 1) / sizeof(Real_t); + Real_t* ptr = buffer + buffer_offset; + buffer_offset += ((offset + 511) / 512) * 512; + return static_cast(ptr); +} + +static inline void +TimeIncrement(Domain& domain) +{ + Real_t targetdt = domain.stoptime() - domain.time(); + + if((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) + { + Real_t ratio; + Real_t olddt = domain.deltatime(); + + Real_t gnewdt = Real_t(1.0e+20); + Real_t newdt; + if(domain.dtcourant() < gnewdt) + { + gnewdt = domain.dtcourant() / Real_t(2.0); + } + 
if(domain.dthydro() < gnewdt) + { + gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0); + } + +#if USE_MPI + MPI_Allreduce(&gnewdt, &newdt, 1, + ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE), MPI_MIN, + MPI_COMM_WORLD); +#else + newdt = gnewdt; +#endif + + ratio = newdt / olddt; + if(ratio >= Real_t(1.0)) + { + if(ratio < domain.deltatimemultlb()) + { + newdt = olddt; + } + else if(ratio > domain.deltatimemultub()) + { + newdt = olddt * domain.deltatimemultub(); + } + } + + if(newdt > domain.dtmax()) + { + newdt = domain.dtmax(); + } + domain.deltatime() = newdt; + } + + if((targetdt > domain.deltatime()) && + (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0)))) + { + targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0); + } + + if(targetdt < domain.deltatime()) + { + domain.deltatime() = targetdt; + } + + domain.time() += domain.deltatime(); + + ++domain.cycle(); +} + +KOKKOS_INLINE_FUNCTION void +CollectDomainNodesToElemNodes(const Domain& domain, const Index_t* elemToNode, + Real_t elemX[8], Real_t elemY[8], Real_t elemZ[8]) +{ + Index_t nd0i = elemToNode[0]; + Index_t nd1i = elemToNode[1]; + Index_t nd2i = elemToNode[2]; + Index_t nd3i = elemToNode[3]; + Index_t nd4i = elemToNode[4]; + Index_t nd5i = elemToNode[5]; + Index_t nd6i = elemToNode[6]; + Index_t nd7i = elemToNode[7]; + + elemX[0] = domain.c_x(nd0i); + elemX[1] = domain.c_x(nd1i); + elemX[2] = domain.c_x(nd2i); + elemX[3] = domain.c_x(nd3i); + elemX[4] = domain.c_x(nd4i); + elemX[5] = domain.c_x(nd5i); + elemX[6] = domain.c_x(nd6i); + elemX[7] = domain.c_x(nd7i); + + elemY[0] = domain.c_y(nd0i); + elemY[1] = domain.c_y(nd1i); + elemY[2] = domain.c_y(nd2i); + elemY[3] = domain.c_y(nd3i); + elemY[4] = domain.c_y(nd4i); + elemY[5] = domain.c_y(nd5i); + elemY[6] = domain.c_y(nd6i); + elemY[7] = domain.c_y(nd7i); + + elemZ[0] = domain.c_z(nd0i); + elemZ[1] = domain.c_z(nd1i); + elemZ[2] = domain.c_z(nd2i); + elemZ[3] = domain.c_z(nd3i); + elemZ[4] = domain.c_z(nd4i); + elemZ[5] = 
domain.c_z(nd5i); + elemZ[6] = domain.c_z(nd6i); + elemZ[7] = domain.c_z(nd7i); +} + +static inline void +InitStressTermsForElems(Domain& domain, Real_t* sigxx, Real_t* sigyy, Real_t* sigzz, + Index_t numElem) +{ + Kokkos::parallel_for("InitStressTermsForElems", numElem, + KOKKOS_LAMBDA(const Index_t& i) { + sigxx[i] = sigyy[i] = sigzz[i] = -domain.p(i) - domain.q(i); + }); +} + +static inline void +CalcElemShapeFunctionDerivatives(Real_t const x[], Real_t const y[], Real_t const z[], + Real_t b[][8], Real_t* const volume) +{ + const Real_t x0 = x[0]; + const Real_t x1 = x[1]; + const Real_t x2 = x[2]; + const Real_t x3 = x[3]; + const Real_t x4 = x[4]; + const Real_t x5 = x[5]; + const Real_t x6 = x[6]; + const Real_t x7 = x[7]; + + const Real_t y0 = y[0]; + const Real_t y1 = y[1]; + const Real_t y2 = y[2]; + const Real_t y3 = y[3]; + const Real_t y4 = y[4]; + const Real_t y5 = y[5]; + const Real_t y6 = y[6]; + const Real_t y7 = y[7]; + + const Real_t z0 = z[0]; + const Real_t z1 = z[1]; + const Real_t z2 = z[2]; + const Real_t z3 = z[3]; + const Real_t z4 = z[4]; + const Real_t z5 = z[5]; + const Real_t z6 = z[6]; + const Real_t z7 = z[7]; + + Real_t fjxxi, fjxet, fjxze; + Real_t fjyxi, fjyet, fjyze; + Real_t fjzxi, fjzet, fjzze; + Real_t cjxxi, cjxet, cjxze; + Real_t cjyxi, cjyet, cjyze; + Real_t cjzxi, cjzet, cjzze; + + fjxxi = Real_t(.125) * ((x6 - x0) + (x5 - x3) - (x7 - x1) - (x4 - x2)); + fjxet = Real_t(.125) * ((x6 - x0) - (x5 - x3) + (x7 - x1) - (x4 - x2)); + fjxze = Real_t(.125) * ((x6 - x0) + (x5 - x3) + (x7 - x1) + (x4 - x2)); + + fjyxi = Real_t(.125) * ((y6 - y0) + (y5 - y3) - (y7 - y1) - (y4 - y2)); + fjyet = Real_t(.125) * ((y6 - y0) - (y5 - y3) + (y7 - y1) - (y4 - y2)); + fjyze = Real_t(.125) * ((y6 - y0) + (y5 - y3) + (y7 - y1) + (y4 - y2)); + + fjzxi = Real_t(.125) * ((z6 - z0) + (z5 - z3) - (z7 - z1) - (z4 - z2)); + fjzet = Real_t(.125) * ((z6 - z0) - (z5 - z3) + (z7 - z1) - (z4 - z2)); + fjzze = Real_t(.125) * ((z6 - z0) + (z5 - z3) + (z7 - z1) 
+ (z4 - z2)); + + cjxxi = (fjyet * fjzze) - (fjzet * fjyze); + cjxet = -(fjyxi * fjzze) + (fjzxi * fjyze); + cjxze = (fjyxi * fjzet) - (fjzxi * fjyet); + + cjyxi = -(fjxet * fjzze) + (fjzet * fjxze); + cjyet = (fjxxi * fjzze) - (fjzxi * fjxze); + cjyze = -(fjxxi * fjzet) + (fjzxi * fjxet); + + cjzxi = (fjxet * fjyze) - (fjyet * fjxze); + cjzet = -(fjxxi * fjyze) + (fjyxi * fjxze); + cjzze = (fjxxi * fjyet) - (fjyxi * fjxet); + + b[0][0] = -cjxxi - cjxet - cjxze; + b[0][1] = cjxxi - cjxet - cjxze; + b[0][2] = cjxxi + cjxet - cjxze; + b[0][3] = -cjxxi + cjxet - cjxze; + b[0][4] = -b[0][2]; + b[0][5] = -b[0][3]; + b[0][6] = -b[0][0]; + b[0][7] = -b[0][1]; + + b[1][0] = -cjyxi - cjyet - cjyze; + b[1][1] = cjyxi - cjyet - cjyze; + b[1][2] = cjyxi + cjyet - cjyze; + b[1][3] = -cjyxi + cjyet - cjyze; + b[1][4] = -b[1][2]; + b[1][5] = -b[1][3]; + b[1][6] = -b[1][0]; + b[1][7] = -b[1][1]; + + b[2][0] = -cjzxi - cjzet - cjzze; + b[2][1] = cjzxi - cjzet - cjzze; + b[2][2] = cjzxi + cjzet - cjzze; + b[2][3] = -cjzxi + cjzet - cjzze; + b[2][4] = -b[2][2]; + b[2][5] = -b[2][3]; + b[2][6] = -b[2][0]; + b[2][7] = -b[2][1]; + + *volume = Real_t(8.) 
* (fjxet * cjxet + fjyet * cjyet + fjzet * cjzet); +} + +KOKKOS_INLINE_FUNCTION void +SumElemFaceNormal(Real_t* normalX0, Real_t* normalY0, Real_t* normalZ0, Real_t* normalX1, + Real_t* normalY1, Real_t* normalZ1, Real_t* normalX2, Real_t* normalY2, + Real_t* normalZ2, Real_t* normalX3, Real_t* normalY3, Real_t* normalZ3, + const Real_t x0, const Real_t y0, const Real_t z0, const Real_t x1, + const Real_t y1, const Real_t z1, const Real_t x2, const Real_t y2, + const Real_t z2, const Real_t x3, const Real_t y3, const Real_t z3) +{ + Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0); + Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0); + Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0); + Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0); + Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0); + Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0); + Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1); + Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1); + Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1); + + *normalX0 += areaX; + *normalX1 += areaX; + *normalX2 += areaX; + *normalX3 += areaX; + + *normalY0 += areaY; + *normalY1 += areaY; + *normalY2 += areaY; + *normalY3 += areaY; + + *normalZ0 += areaZ; + *normalZ1 += areaZ; + *normalZ2 += areaZ; + *normalZ3 += areaZ; +} + +KOKKOS_INLINE_FUNCTION void +CalcElemNodeNormals(Real_t pfx[8], Real_t pfy[8], Real_t pfz[8], const Real_t x[8], + const Real_t y[8], const Real_t z[8]) +{ + for(Index_t i = 0; i < 8; ++i) + { + pfx[i] = Real_t(0.0); + pfy[i] = Real_t(0.0); + pfz[i] = Real_t(0.0); + } + + SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0], &pfx[1], &pfy[1], &pfz[1], &pfx[2], + &pfy[2], &pfz[2], &pfx[3], &pfy[3], &pfz[3], x[0], y[0], z[0], x[1], + y[1], z[1], x[2], y[2], z[2], x[3], y[3], z[3]); + + SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0], &pfx[4], &pfy[4], &pfz[4], &pfx[5], + &pfy[5], &pfz[5], &pfx[1], &pfy[1], &pfz[1], x[0], y[0], z[0], 
x[4], + y[4], z[4], x[5], y[5], z[5], x[1], y[1], z[1]); + + SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1], &pfx[5], &pfy[5], &pfz[5], &pfx[6], + &pfy[6], &pfz[6], &pfx[2], &pfy[2], &pfz[2], x[1], y[1], z[1], x[5], + y[5], z[5], x[6], y[6], z[6], x[2], y[2], z[2]); + + SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2], &pfx[6], &pfy[6], &pfz[6], &pfx[7], + &pfy[7], &pfz[7], &pfx[3], &pfy[3], &pfz[3], x[2], y[2], z[2], x[6], + y[6], z[6], x[7], y[7], z[7], x[3], y[3], z[3]); + + SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3], &pfx[7], &pfy[7], &pfz[7], &pfx[4], + &pfy[4], &pfz[4], &pfx[0], &pfy[0], &pfz[0], x[3], y[3], z[3], x[7], + y[7], z[7], x[4], y[4], z[4], x[0], y[0], z[0]); + + SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4], &pfx[7], &pfy[7], &pfz[7], &pfx[6], + &pfy[6], &pfz[6], &pfx[5], &pfy[5], &pfz[5], x[4], y[4], z[4], x[7], + y[7], z[7], x[6], y[6], z[6], x[5], y[5], z[5]); +} + +KOKKOS_INLINE_FUNCTION void +SumElemStressesToNodeForces(const Real_t B[][8], const Real_t stress_xx, + const Real_t stress_yy, const Real_t stress_zz, Real_t fx[], + Real_t fy[], Real_t fz[]) +{ + for(Index_t i = 0; i < 8; i++) + { + fx[i] = -(stress_xx * B[0][i]); + fy[i] = -(stress_yy * B[1][i]); + fz[i] = -(stress_zz * B[2][i]); + } +} + +static inline void +IntegrateStressForElems(Domain& domain, Real_t* sigxx, Real_t* sigyy, Real_t* sigzz, + Real_t* determ, Index_t numElem, Index_t numNode) +{ + Index_t numElem8 = numElem * 8; + ResizeBuffer((numElem8 * sizeof(Real_t) + 4096) * 3); + Real_t* fx_elem = AllocateFromBuffer(numElem8); + Real_t* fy_elem = AllocateFromBuffer(numElem8); + Real_t* fz_elem = AllocateFromBuffer(numElem8); + + Kokkos::parallel_for( + "IntegrateStressForElems A", numElem, KOKKOS_LAMBDA(const int k) { + const Index_t* const elemToNode = domain.nodelist(k); + Real_t B[3][8]; + Real_t x_local[8]; + Real_t y_local[8]; + Real_t z_local[8]; + + CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local); + + CalcElemShapeFunctionDerivatives(x_local, 
y_local, z_local, B, &determ[k]); + + CalcElemNodeNormals(B[0], B[1], B[2], x_local, y_local, z_local); + + SumElemStressesToNodeForces(B, sigxx[k], sigyy[k], sigzz[k], &fx_elem[k * 8], + &fy_elem[k * 8], &fz_elem[k * 8]); + }); + +#ifdef KOKKOS_HAVE_CUDA + int team_size = + std::is_same::value ? 128 : 1; +#else + int team_size = 1; +#endif + Kokkos::parallel_for( + "IntegrateStressForElems B", + Kokkos::TeamPolicy<>((numNode + 127) / 128, team_size, 2), + KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<>::member_type& team) { + const Index_t gnode_begin = team.league_rank() * 128; + const Index_t gnode_end = + (gnode_begin + 128 < numNode) ? gnode_begin + 128 : numNode; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, gnode_begin, gnode_end), + [&](const Index_t& gnode) { + Index_t count = domain.nodeElemCount(gnode); + Index_t* cornerList = domain.nodeElemCornerList(gnode); + reduce_double3 f_tmp; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, count), + [&](const Index_t& i, + reduce_double3& tmp) { // vectorized with ivdep + Index_t elem = cornerList[i]; + tmp.x += fx_elem[elem]; + tmp.y += fy_elem[elem]; + tmp.z += fz_elem[elem]; + }, + f_tmp); + Kokkos::single(Kokkos::PerThread(team), [&]() { + domain.fx(gnode) += f_tmp.x; + domain.fy(gnode) += f_tmp.y; + domain.fz(gnode) += f_tmp.z; + }); + }); + }); +} + +static inline void +VoluDer(const Real_t x0, const Real_t x1, const Real_t x2, const Real_t x3, + const Real_t x4, const Real_t x5, const Real_t y0, const Real_t y1, + const Real_t y2, const Real_t y3, const Real_t y4, const Real_t y5, + const Real_t z0, const Real_t z1, const Real_t z2, const Real_t z3, + const Real_t z4, const Real_t z5, Real_t& dvdx, Real_t& dvdy, Real_t& dvdz) +{ + const Real_t twelfth = Real_t(1.0) / Real_t(12.0); + + dvdx = (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) + (y0 + y4) * (z3 + z4) - + (y3 + y4) * (z0 + z4) - (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5); + dvdy = -(x1 + x2) * (z0 + z1) + (x0 + x1) * 
(z1 + z2) - (x0 + x4) * (z3 + z4) + + (x3 + x4) * (z0 + z4) + (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5); + + dvdz = -(y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) - (y0 + y4) * (x3 + x4) + + (y3 + y4) * (x0 + x4) + (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5); + + dvdx *= twelfth; + dvdy *= twelfth; + dvdz *= twelfth; +} + +KOKKOS_INLINE_FUNCTION +void +CalcElemVolumeDerivative( + const Int_t& i, + const Kokkos::View>& dvdx, + const Kokkos::View>& dvdy, + const Kokkos::View>& dvdz, + const Real_t x[8], const Real_t y[8], const Real_t z[8]) +{ +#pragma nounroll + for(int j = 0; j < 4; j++) + { + VoluDer(x[(j + 1) % 4], x[(j + 2) % 4], x[(j + 3) % 4], x[(j + 0) % 4 + 4], + x[(j + 1) % 4 + 4], x[(j + 3) % 4 + 4], y[(j + 1) % 4], y[(j + 2) % 4], + y[(j + 3) % 4], y[(j + 0) % 4 + 4], y[(j + 1) % 4 + 4], + y[(j + 3) % 4 + 4], z[(j + 1) % 4], z[(j + 2) % 4], z[(j + 3) % 4], + z[(j + 0) % 4 + 4], z[(j + 1) % 4 + 4], z[(j + 3) % 4 + 4], dvdx(i, j), + dvdy(i, j), dvdz(i, j)); + VoluDer(x[(j + 3) % 4 + 4], x[(j + 2) % 4 + 4], x[(j + 1) % 4 + 4], + x[(j + 0) % 4], x[(j + 3) % 4], x[(j + 1) % 4], y[(j + 3) % 4 + 4], + y[(j + 2) % 4 + 4], y[(j + 1) % 4 + 4], y[(j + 0) % 4], y[(j + 3) % 4], + y[(j + 1) % 4], z[(j + 3) % 4 + 4], z[(j + 2) % 4 + 4], + z[(j + 1) % 4 + 4], z[(j + 0) % 4], z[(j + 3) % 4], z[(j + 1) % 4], + dvdx(i, j + 4), dvdy(i, j + 4), dvdz(i, j + 4)); + } +} + +KOKKOS_INLINE_FUNCTION +void +CalcElemFBHourglassForce(const Real_t* xd, const Real_t hourgam[][8], + const Real_t& coefficient, Real_t* hgfx) +{ + Real_t hxx[4]; + for(Index_t i = 0; i < 4; i++) + { + hxx[i] = hourgam[i][0] * xd[0] + hourgam[i][1] * xd[1] + hourgam[i][2] * xd[2] + + hourgam[i][3] * xd[3] + hourgam[i][4] * xd[4] + hourgam[i][5] * xd[5] + + hourgam[i][6] * xd[6] + hourgam[i][7] * xd[7]; + } + for(Index_t i = 0; i < 8; i++) + { + hgfx[i] = coefficient * (hourgam[0][i] * hxx[0] + hourgam[1][i] * hxx[1] + + hourgam[2][i] * hxx[2] + hourgam[3][i] * hxx[3]); + } +} + +struct Gamma +{ + Real_t 
gamma[4][8]; + Gamma() + { + gamma[0][0] = Real_t(1.); + gamma[0][1] = Real_t(1.); + gamma[0][2] = Real_t(-1.); + gamma[0][3] = Real_t(-1.); + gamma[0][4] = Real_t(-1.); + gamma[0][5] = Real_t(-1.); + gamma[0][6] = Real_t(1.); + gamma[0][7] = Real_t(1.); + gamma[1][0] = Real_t(1.); + gamma[1][1] = Real_t(-1.); + gamma[1][2] = Real_t(-1.); + gamma[1][3] = Real_t(1.); + gamma[1][4] = Real_t(-1.); + gamma[1][5] = Real_t(1.); + gamma[1][6] = Real_t(1.); + gamma[1][7] = Real_t(-1.); + gamma[2][0] = Real_t(1.); + gamma[2][1] = Real_t(-1.); + gamma[2][2] = Real_t(1.); + gamma[2][3] = Real_t(-1.); + gamma[2][4] = Real_t(1.); + gamma[2][5] = Real_t(-1.); + gamma[2][6] = Real_t(1.); + gamma[2][7] = Real_t(-1.); + gamma[3][0] = Real_t(-1.); + gamma[3][1] = Real_t(1.); + gamma[3][2] = Real_t(-1.); + gamma[3][3] = Real_t(1.); + gamma[3][4] = Real_t(1.); + gamma[3][5] = Real_t(-1.); + gamma[3][6] = Real_t(1.); + gamma[3][7] = Real_t(-1.); + } +}; + +static inline void +CalcFBHourglassForceForElems( + Domain& domain, Real_t* determ, + const Kokkos::View> x8n, + const Kokkos::View> y8n, + const Kokkos::View> z8n, + const Kokkos::View> dvdx, + const Kokkos::View> dvdy, + const Kokkos::View> dvdz, + Real_t hourg, Index_t numElem, Index_t numNode) +{ + Index_t numElem8 = numElem * 8; + + Real_t* fx_elem; + Real_t* fy_elem; + Real_t* fz_elem; + + if(do_atomic == 0) + { + fx_elem = AllocateFromBuffer(numElem8); + fy_elem = AllocateFromBuffer(numElem8); + fz_elem = AllocateFromBuffer(numElem8); + } + + Gamma G; + + Int_t do_atomic_dev = do_atomic; + + Kokkos::parallel_for( + "CalcFBHourglassForceForElems A", numElem, KOKKOS_LAMBDA(const int& i2) { + Real_t *fx_local, *fy_local, *fz_local; + Real_t hgfx[8]; + + Real_t hourgam[4][8]; + Real_t xd1[8]; + + const Index_t* elemToNode = domain.nodelist(i2); + Index_t i3 = 8 * i2; + Real_t volinv = Real_t(1.0) / determ[i2]; + + for(Index_t i1 = 0; i1 < 4; ++i1) + { + Real_t hourmodx = 0.0; + for(int j = 0; j < 8; j++) + hourmodx += x8n(i2, j) * 
G.gamma[i1][j]; + + Real_t hourmody = 0.0; + for(int j = 0; j < 8; j++) + hourmody += y8n(i2, j) * G.gamma[i1][j]; + + Real_t hourmodz = 0.0; + for(int j = 0; j < 8; j++) + hourmodz += z8n(i2, j) * G.gamma[i1][j]; + +#pragma ivdep + for(int j = 0; j < 8; j++) + hourgam[i1][j] = G.gamma[i1][j] - volinv * (dvdx(i2, j) * hourmodx + + dvdy(i2, j) * hourmody + + dvdz(i2, j) * hourmodz); + } + + const Real_t ss1 = domain.ss(i2); + const Real_t mass1 = domain.elemMass(i2); + const Real_t volume13 = CBRT(determ[i2]); + + const Index_t n0si2 = elemToNode[0]; + const Index_t n1si2 = elemToNode[1]; + const Index_t n2si2 = elemToNode[2]; + const Index_t n3si2 = elemToNode[3]; + const Index_t n4si2 = elemToNode[4]; + const Index_t n5si2 = elemToNode[5]; + const Index_t n6si2 = elemToNode[6]; + const Index_t n7si2 = elemToNode[7]; + + const Real_t coefficient = -hourg * Real_t(0.01) * ss1 * mass1 / volume13; + + xd1[0] = domain.xd(n0si2); + xd1[1] = domain.xd(n1si2); + xd1[2] = domain.xd(n2si2); + xd1[3] = domain.xd(n3si2); + xd1[4] = domain.xd(n4si2); + xd1[5] = domain.xd(n5si2); + xd1[6] = domain.xd(n6si2); + xd1[7] = domain.xd(n7si2); + + CalcElemFBHourglassForce(xd1, hourgam, coefficient, hgfx); + + if(!do_atomic_dev) + { + fx_local = &fx_elem[i3]; + fx_local[0] = hgfx[0]; + fx_local[1] = hgfx[1]; + fx_local[2] = hgfx[2]; + fx_local[3] = hgfx[3]; + fx_local[4] = hgfx[4]; + fx_local[5] = hgfx[5]; + fx_local[6] = hgfx[6]; + fx_local[7] = hgfx[7]; + } + else + { + Kokkos::atomic_add(&domain.fx(n0si2), hgfx[0]); + Kokkos::atomic_add(&domain.fx(n1si2), hgfx[1]); + Kokkos::atomic_add(&domain.fx(n2si2), hgfx[2]); + Kokkos::atomic_add(&domain.fx(n3si2), hgfx[3]); + Kokkos::atomic_add(&domain.fx(n4si2), hgfx[4]); + Kokkos::atomic_add(&domain.fx(n5si2), hgfx[5]); + Kokkos::atomic_add(&domain.fx(n6si2), hgfx[6]); + Kokkos::atomic_add(&domain.fx(n7si2), hgfx[7]); + } + + xd1[0] = domain.yd(n0si2); + xd1[1] = domain.yd(n1si2); + xd1[2] = domain.yd(n2si2); + xd1[3] = domain.yd(n3si2); + 
xd1[4] = domain.yd(n4si2); + xd1[5] = domain.yd(n5si2); + xd1[6] = domain.yd(n6si2); + xd1[7] = domain.yd(n7si2); + + CalcElemFBHourglassForce(xd1, hourgam, coefficient, hgfx); + + if(!do_atomic_dev) + { + fy_local = &fy_elem[i3]; + fy_local[0] = hgfx[0]; + fy_local[1] = hgfx[1]; + fy_local[2] = hgfx[2]; + fy_local[3] = hgfx[3]; + fy_local[4] = hgfx[4]; + fy_local[5] = hgfx[5]; + fy_local[6] = hgfx[6]; + fy_local[7] = hgfx[7]; + } + else + { + Kokkos::atomic_add(&domain.fy(n0si2), hgfx[0]); + Kokkos::atomic_add(&domain.fy(n1si2), hgfx[1]); + Kokkos::atomic_add(&domain.fy(n2si2), hgfx[2]); + Kokkos::atomic_add(&domain.fy(n3si2), hgfx[3]); + Kokkos::atomic_add(&domain.fy(n4si2), hgfx[4]); + Kokkos::atomic_add(&domain.fy(n5si2), hgfx[5]); + Kokkos::atomic_add(&domain.fy(n6si2), hgfx[6]); + Kokkos::atomic_add(&domain.fy(n7si2), hgfx[7]); + } + + xd1[0] = domain.zd(n0si2); + xd1[1] = domain.zd(n1si2); + xd1[2] = domain.zd(n2si2); + xd1[3] = domain.zd(n3si2); + xd1[4] = domain.zd(n4si2); + xd1[5] = domain.zd(n5si2); + xd1[6] = domain.zd(n6si2); + xd1[7] = domain.zd(n7si2); + + CalcElemFBHourglassForce(xd1, hourgam, coefficient, hgfx); + + if(!do_atomic_dev) + { + fz_local = &fz_elem[i3]; + fz_local[0] = hgfx[0]; + fz_local[1] = hgfx[1]; + fz_local[2] = hgfx[2]; + fz_local[3] = hgfx[3]; + fz_local[4] = hgfx[4]; + fz_local[5] = hgfx[5]; + fz_local[6] = hgfx[6]; + fz_local[7] = hgfx[7]; + } + else + { + Kokkos::atomic_add(&domain.fz(n0si2), hgfx[0]); + Kokkos::atomic_add(&domain.fz(n1si2), hgfx[1]); + Kokkos::atomic_add(&domain.fz(n2si2), hgfx[2]); + Kokkos::atomic_add(&domain.fz(n3si2), hgfx[3]); + Kokkos::atomic_add(&domain.fz(n4si2), hgfx[4]); + Kokkos::atomic_add(&domain.fz(n5si2), hgfx[5]); + Kokkos::atomic_add(&domain.fz(n6si2), hgfx[6]); + Kokkos::atomic_add(&domain.fz(n7si2), hgfx[7]); + } + }); + + if(!do_atomic) + { +#ifdef KOKKOS_HAVE_CUDA + int team_size = + std::is_same::value ? 
128 : 1; +#else + int team_size = 1; +#endif + Kokkos::parallel_for( + "CalcFBHourglassForceForElems B", + Kokkos::TeamPolicy<>((numNode + 127) / 128, team_size, 2), + KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<>::member_type& team) { + const Index_t gnode_begin = team.league_rank() * 128; + const Index_t gnode_end = + (gnode_begin + 128 < numNode) ? gnode_begin + 128 : numNode; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, gnode_begin, gnode_end), + [&](const Index_t& gnode) { + Index_t count = domain.nodeElemCount(gnode); + Index_t* cornerList = domain.nodeElemCornerList(gnode); + reduce_double3 f_tmp; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, count), + [&](const Index_t& i, + reduce_double3& tmp) { // vectorized with ivdep + Index_t elem = cornerList[i]; + tmp.x += fx_elem[elem]; + tmp.y += fy_elem[elem]; + tmp.z += fz_elem[elem]; + }, + f_tmp); + Kokkos::single(Kokkos::PerThread(team), [&]() { + domain.fx(gnode) += f_tmp.x; + domain.fy(gnode) += f_tmp.y; + domain.fz(gnode) += f_tmp.z; + }); + }); + }); + } +} + +static inline void +CalcHourglassControlForElems(Domain& domain, Real_t determ[], Real_t hgcoef) +{ + Index_t numElem = domain.numElem(); + Index_t numElem8 = numElem * 8; + ResizeBuffer((numElem8 * sizeof(Real_t) + 4096) * (do_atomic ? 
6 : 9)); + + Real_t* dvdx = AllocateFromBuffer(numElem8); + Real_t* dvdy = AllocateFromBuffer(numElem8); + Real_t* dvdz = AllocateFromBuffer(numElem8); + Real_t* x8n = AllocateFromBuffer(numElem8); + Real_t* y8n = AllocateFromBuffer(numElem8); + Real_t* z8n = AllocateFromBuffer(numElem8); + Kokkos::View> v_x8n(x8n, numElem, + 8); + Kokkos::View> v_y8n(y8n, numElem, + 8); + Kokkos::View> v_z8n(z8n, numElem, + 8); + Kokkos::View> v_dvdx(dvdx, numElem, + 8); + Kokkos::View> v_dvdy(dvdy, numElem, + 8); + Kokkos::View> v_dvdz(dvdz, numElem, + 8); + + int error = 0; + Kokkos::parallel_reduce( + numElem, + KOKKOS_LAMBDA(const int i, int& err) { + Real_t x1[8], y1[8], z1[8]; + + Index_t* elemToNode = domain.nodelist(i); + CollectDomainNodesToElemNodes(domain, elemToNode, x1, y1, z1); + + CalcElemVolumeDerivative(i, v_dvdx, v_dvdy, v_dvdz, x1, y1, z1); + + for(Index_t ii = 0; ii < 8; ++ii) + { + v_x8n(i, ii) = x1[ii]; + v_y8n(i, ii) = y1[ii]; + v_z8n(i, ii) = z1[ii]; + } + + determ[i] = domain.volo(i) * domain.v(i); + + if(domain.v(i) <= Real_t(0.0)) + { + err++; + } + }, + error); + + if(error) +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, VolumeError); +#else + exit(VolumeError); +#endif + + if(hgcoef > Real_t(0.)) + { + CalcFBHourglassForceForElems(domain, determ, v_x8n, v_y8n, v_z8n, v_dvdx, v_dvdy, + v_dvdz, hgcoef, numElem, domain.numNode()); + } + + return; +} + +static inline void +CalcVolumeForceForElems(Domain& domain) +{ + Index_t numElem = domain.numElem(); + if(numElem != 0) + { + Real_t hgcoef = domain.hgcoef(); + Real_t* sigxx = Allocate(numElem); + Real_t* sigyy = Allocate(numElem); + Real_t* sigzz = Allocate(numElem); + Real_t* determ = Allocate(numElem); + + InitStressTermsForElems(domain, sigxx, sigyy, sigzz, numElem); + + IntegrateStressForElems(domain, sigxx, sigyy, sigzz, determ, numElem, + domain.numNode()); + + // check for negative element volume + int error = 0; + Kokkos::parallel_reduce(numElem, + KOKKOS_LAMBDA(const int k, int& err) { + if(determ[k] 
<= Real_t(0.0)) + { + err++; + } + }, + error); + + if(error) +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, VolumeError); +#else + exit(VolumeError); +#endif + + CalcHourglassControlForElems(domain, determ, hgcoef); + + Release(&determ); + Release(&sigzz); + Release(&sigyy); + Release(&sigxx); + } +} + +static inline void +CalcForceForNodes(Domain& domain) +{ + Index_t numNode = domain.numNode(); + +#if USE_MPI + CommRecv(domain, MSG_COMM_SBN, 3, domain.sizeX() + 1, domain.sizeY() + 1, + domain.sizeZ() + 1, true, false); +#endif + + Kokkos::parallel_for("CalcForceForNodes", numNode, KOKKOS_LAMBDA(const int i) { + domain.fx(i) = Real_t(0.0); + domain.fy(i) = Real_t(0.0); + domain.fz(i) = Real_t(0.0); + }); + + CalcVolumeForceForElems(domain); + +#if USE_MPI + Domain_member fieldData[3]; + fieldData[0] = &Domain::fx; + fieldData[1] = &Domain::fy; + fieldData[2] = &Domain::fz; + + CommSend(domain, MSG_COMM_SBN, 3, fieldData, domain.sizeX() + 1, domain.sizeY() + 1, + domain.sizeZ() + 1, true, false); + CommSBN(domain, 3, fieldData); +#endif +} + +static inline void +CalcAccelerationForNodes(Domain& domain, Index_t numNode) +{ + Kokkos::parallel_for("CalcAccelerationForNodes", numNode, KOKKOS_LAMBDA(const int i) { + domain.xdd(i) = domain.fx(i) / domain.nodalMass(i); + domain.ydd(i) = domain.fy(i) / domain.nodalMass(i); + domain.zdd(i) = domain.fz(i) / domain.nodalMass(i); + }); +} + +static inline void +ApplyAccelerationBoundaryConditionsForNodes(Domain& domain) +{ + Index_t size = domain.sizeX(); + Index_t numNodeBC = (size + 1) * (size + 1); + + if(!domain.symmXempty() != 0) + { + Kokkos::parallel_for( + "ApplyAccelerationBoundaryConditionsForNodes A", numNodeBC, + KOKKOS_LAMBDA(const int i) { domain.xdd(domain.symmX(i)) = Real_t(0.0); }); + } + + if(!domain.symmYempty() != 0) + { + Kokkos::parallel_for( + "ApplyAccelerationBoundaryConditionsForNodes B", numNodeBC, + KOKKOS_LAMBDA(const int i) { domain.ydd(domain.symmY(i)) = Real_t(0.0); }); + } + + 
if(!domain.symmZempty() != 0) + { + Kokkos::parallel_for( + "ApplyAccelerationBoundaryConditionsForNodes C", numNodeBC, + KOKKOS_LAMBDA(const int i) { domain.zdd(domain.symmZ(i)) = Real_t(0.0); }); + } +} + +static inline void +CalcVelocityForNodes(Domain& domain, const Real_t dt, const Real_t u_cut, Index_t numNode) +{ + Kokkos::parallel_for("CalcVelocityForNodes", numNode, KOKKOS_LAMBDA(const int i) { + Real_t xdtmp, ydtmp, zdtmp; + + xdtmp = domain.xd(i) + domain.xdd(i) * dt; + if(FABS(xdtmp) < u_cut) + xdtmp = Real_t(0.0); + domain.xd(i) = xdtmp; + + ydtmp = domain.yd(i) + domain.ydd(i) * dt; + if(FABS(ydtmp) < u_cut) + ydtmp = Real_t(0.0); + domain.yd(i) = ydtmp; + + zdtmp = domain.zd(i) + domain.zdd(i) * dt; + if(FABS(zdtmp) < u_cut) + zdtmp = Real_t(0.0); + domain.zd(i) = zdtmp; + }); +} + +static inline void +CalcPositionForNodes(Domain& domain, const Real_t dt, Index_t numNode) +{ + Kokkos::parallel_for("CalcPositionForNodes", numNode, KOKKOS_LAMBDA(const int i) { + domain.x(i) += domain.xd(i) * dt; + domain.y(i) += domain.yd(i) * dt; + domain.z(i) += domain.zd(i) * dt; + }); +} + +static inline void +LagrangeNodal(Domain& domain) +{ +#ifdef SEDOV_SYNC_POS_VEL_EARLY + Domain_member fieldData[6]; +#endif + + const Real_t delt = domain.deltatime(); + Real_t u_cut = domain.u_cut(); + + CalcForceForNodes(domain); + +#if USE_MPI +# ifdef SEDOV_SYNC_POS_VEL_EARLY + CommRecv(domain, MSG_SYNC_POS_VEL, 6, domain.sizeX() + 1, domain.sizeY() + 1, + domain.sizeZ() + 1, false, false); +# endif +#endif + + CalcAccelerationForNodes(domain, domain.numNode()); + + ApplyAccelerationBoundaryConditionsForNodes(domain); + + CalcVelocityForNodes(domain, delt, u_cut, domain.numNode()); + + CalcPositionForNodes(domain, delt, domain.numNode()); +#if USE_MPI +# ifdef SEDOV_SYNC_POS_VEL_EARLY + fieldData[0] = &Domain::x; + fieldData[1] = &Domain::y; + fieldData[2] = &Domain::z; + fieldData[3] = &Domain::xd; + fieldData[4] = &Domain::yd; + fieldData[5] = &Domain::zd; + + 
CommSend(domain, MSG_SYNC_POS_VEL, 6, fieldData, domain.sizeX() + 1, + domain.sizeY() + 1, domain.sizeZ() + 1, false, false); + CommSyncPosVel(domain); +# endif +#endif + + return; +} + +KOKKOS_INLINE_FUNCTION Real_t + CalcElemVolume(const Real_t x0, const Real_t x1, const Real_t x2, const Real_t x3, + const Real_t x4, const Real_t x5, const Real_t x6, const Real_t x7, + const Real_t y0, const Real_t y1, const Real_t y2, const Real_t y3, + const Real_t y4, const Real_t y5, const Real_t y6, const Real_t y7, + const Real_t z0, const Real_t z1, const Real_t z2, const Real_t z3, + const Real_t z4, const Real_t z5, const Real_t z6, const Real_t z7) +{ + Real_t twelveth = Real_t(1.0) / Real_t(12.0); + + Real_t dx61 = x6 - x1; + Real_t dy61 = y6 - y1; + Real_t dz61 = z6 - z1; + + Real_t dx70 = x7 - x0; + Real_t dy70 = y7 - y0; + Real_t dz70 = z7 - z0; + + Real_t dx63 = x6 - x3; + Real_t dy63 = y6 - y3; + Real_t dz63 = z6 - z3; + + Real_t dx20 = x2 - x0; + Real_t dy20 = y2 - y0; + Real_t dz20 = z2 - z0; + + Real_t dx50 = x5 - x0; + Real_t dy50 = y5 - y0; + Real_t dz50 = z5 - z0; + + Real_t dx64 = x6 - x4; + Real_t dy64 = y6 - y4; + Real_t dz64 = z6 - z4; + + Real_t dx31 = x3 - x1; + Real_t dy31 = y3 - y1; + Real_t dz31 = z3 - z1; + + Real_t dx72 = x7 - x2; + Real_t dy72 = y7 - y2; + Real_t dz72 = z7 - z2; + + Real_t dx43 = x4 - x3; + Real_t dy43 = y4 - y3; + Real_t dz43 = z4 - z3; + + Real_t dx57 = x5 - x7; + Real_t dy57 = y5 - y7; + Real_t dz57 = z5 - z7; + + Real_t dx14 = x1 - x4; + Real_t dy14 = y1 - y4; + Real_t dz14 = z1 - z4; + + Real_t dx25 = x2 - x5; + Real_t dy25 = y2 - y5; + Real_t dz25 = z2 - z5; + +#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \ + ((x1) * ((y2) * (z3) - (z2) * (y3)) + (x2) * ((z1) * (y3) - (y1) * (z3)) + \ + (x3) * ((y1) * (z2) - (z1) * (y2))) + + Real_t volume = TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20, dy31 + dy72, dy63, dy20, + dz31 + dz72, dz63, dz20) + + TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70, dy43 + dy57, dy64, dy70, + dz43 + 
dz57, dz64, dz70) + + TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50, dy14 + dy25, dy61, dy50, + dz14 + dz25, dz61, dz50); + +#undef TRIPLE_PRODUCT + + volume *= twelveth; + + return volume; +} + +KOKKOS_INLINE_FUNCTION +Real_t +CalcElemVolume(const Real_t x[8], const Real_t y[8], const Real_t z[8]) +{ + return CalcElemVolume(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], y[0], y[1], + y[2], y[3], y[4], y[5], y[6], y[7], z[0], z[1], z[2], z[3], + z[4], z[5], z[6], z[7]); +} + +static inline Real_t +AreaFace(const Real_t x0, const Real_t x1, const Real_t x2, const Real_t x3, + const Real_t y0, const Real_t y1, const Real_t y2, const Real_t y3, + const Real_t z0, const Real_t z1, const Real_t z2, const Real_t z3) +{ + Real_t fx = (x2 - x0) - (x3 - x1); + Real_t fy = (y2 - y0) - (y3 - y1); + Real_t fz = (z2 - z0) - (z3 - z1); + Real_t gx = (x2 - x0) + (x3 - x1); + Real_t gy = (y2 - y0) + (y3 - y1); + Real_t gz = (z2 - z0) + (z3 - z1); + Real_t area = (fx * fx + fy * fy + fz * fz) * (gx * gx + gy * gy + gz * gz) - + (fx * gx + fy * gy + fz * gz) * (fx * gx + fy * gy + fz * gz); + return area; +} + +KOKKOS_INLINE_FUNCTION Real_t + CalcElemCharacteristicLength(const Real_t x[8], const Real_t y[8], const Real_t z[8], + const Real_t volume) +{ + Real_t a, charLength = Real_t(0.0); + + a = AreaFace(x[0], x[1], x[2], x[3], y[0], y[1], y[2], y[3], z[0], z[1], z[2], z[3]); + charLength = MAX(a, charLength); + + a = AreaFace(x[4], x[5], x[6], x[7], y[4], y[5], y[6], y[7], z[4], z[5], z[6], z[7]); + charLength = MAX(a, charLength); + + a = AreaFace(x[0], x[1], x[5], x[4], y[0], y[1], y[5], y[4], z[0], z[1], z[5], z[4]); + charLength = MAX(a, charLength); + + a = AreaFace(x[1], x[2], x[6], x[5], y[1], y[2], y[6], y[5], z[1], z[2], z[6], z[5]); + charLength = MAX(a, charLength); + + a = AreaFace(x[2], x[3], x[7], x[6], y[2], y[3], y[7], y[6], z[2], z[3], z[7], z[6]); + charLength = MAX(a, charLength); + + a = AreaFace(x[3], x[0], x[4], x[7], y[3], y[0], y[4], y[7], z[3], z[0], z[4], 
z[7]); + charLength = MAX(a, charLength); + + charLength = Real_t(4.0) * volume / SQRT(charLength); + + return charLength; +} + +static inline void +CalcElemVelocityGradient(const Real_t* const xvel, const Real_t* const yvel, + const Real_t* const zvel, const Real_t b[][8], const Real_t detJ, + Real_t* const d) +{ + const Real_t inv_detJ = Real_t(1.0) / detJ; + Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz; + const Real_t* const pfx = b[0]; + const Real_t* const pfy = b[1]; + const Real_t* const pfz = b[2]; + + d[0] = inv_detJ * (pfx[0] * (xvel[0] - xvel[6]) + pfx[1] * (xvel[1] - xvel[7]) + + pfx[2] * (xvel[2] - xvel[4]) + pfx[3] * (xvel[3] - xvel[5])); + + d[1] = inv_detJ * (pfy[0] * (yvel[0] - yvel[6]) + pfy[1] * (yvel[1] - yvel[7]) + + pfy[2] * (yvel[2] - yvel[4]) + pfy[3] * (yvel[3] - yvel[5])); + + d[2] = inv_detJ * (pfz[0] * (zvel[0] - zvel[6]) + pfz[1] * (zvel[1] - zvel[7]) + + pfz[2] * (zvel[2] - zvel[4]) + pfz[3] * (zvel[3] - zvel[5])); + + dyddx = inv_detJ * (pfx[0] * (yvel[0] - yvel[6]) + pfx[1] * (yvel[1] - yvel[7]) + + pfx[2] * (yvel[2] - yvel[4]) + pfx[3] * (yvel[3] - yvel[5])); + + dxddy = inv_detJ * (pfy[0] * (xvel[0] - xvel[6]) + pfy[1] * (xvel[1] - xvel[7]) + + pfy[2] * (xvel[2] - xvel[4]) + pfy[3] * (xvel[3] - xvel[5])); + + dzddx = inv_detJ * (pfx[0] * (zvel[0] - zvel[6]) + pfx[1] * (zvel[1] - zvel[7]) + + pfx[2] * (zvel[2] - zvel[4]) + pfx[3] * (zvel[3] - zvel[5])); + + dxddz = inv_detJ * (pfz[0] * (xvel[0] - xvel[6]) + pfz[1] * (xvel[1] - xvel[7]) + + pfz[2] * (xvel[2] - xvel[4]) + pfz[3] * (xvel[3] - xvel[5])); + + dzddy = inv_detJ * (pfy[0] * (zvel[0] - zvel[6]) + pfy[1] * (zvel[1] - zvel[7]) + + pfy[2] * (zvel[2] - zvel[4]) + pfy[3] * (zvel[3] - zvel[5])); + + dyddz = inv_detJ * (pfz[0] * (yvel[0] - yvel[6]) + pfz[1] * (yvel[1] - yvel[7]) + + pfz[2] * (yvel[2] - yvel[4]) + pfz[3] * (yvel[3] - yvel[5])); + d[5] = Real_t(.5) * (dxddy + dyddx); + d[4] = Real_t(.5) * (dxddz + dzddx); + d[3] = Real_t(.5) * (dzddy + dyddz); +} + +void 
+CalcKinematicsForElems(Domain& domain, Real_t deltaTime, Index_t numElem) +{ + Kokkos::parallel_for("CalcKinematicsForElems", numElem, KOKKOS_LAMBDA(const int k) { + Real_t B[3][8]; + Real_t D[6]; + Real_t x_local[8]; + Real_t y_local[8]; + Real_t z_local[8]; + Real_t xd_local[8]; + Real_t yd_local[8]; + Real_t zd_local[8]; + Real_t detJ = Real_t(0.0); + + Real_t volume; + Real_t relativeVolume; + const Index_t* const elemToNode = domain.nodelist(k); + + CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local); + + volume = CalcElemVolume(x_local, y_local, z_local); + relativeVolume = volume / domain.volo(k); + domain.vnew(k) = relativeVolume; + domain.delv(k) = relativeVolume - domain.v(k); + + domain.arealg(k) = + CalcElemCharacteristicLength(x_local, y_local, z_local, volume); + + for(Index_t lnode = 0; lnode < 8; ++lnode) + { + Index_t gnode = elemToNode[lnode]; + xd_local[lnode] = domain.c_xd(gnode); + yd_local[lnode] = domain.c_yd(gnode); + zd_local[lnode] = domain.c_zd(gnode); + } + + Real_t dt2 = Real_t(0.5) * deltaTime; + for(Index_t j = 0; j < 8; ++j) + { + x_local[j] -= dt2 * xd_local[j]; + y_local[j] -= dt2 * yd_local[j]; + z_local[j] -= dt2 * zd_local[j]; + } + + CalcElemShapeFunctionDerivatives(x_local, y_local, z_local, B, &detJ); + + CalcElemVelocityGradient(xd_local, yd_local, zd_local, B, detJ, D); + + domain.dxx(k) = D[0]; + domain.dyy(k) = D[1]; + domain.dzz(k) = D[2]; + }); +} + +static inline void +CalcLagrangeElements(Domain& domain) +{ + Index_t numElem = domain.numElem(); + if(numElem > 0) + { + const Real_t deltatime = domain.deltatime(); + + domain.AllocateStrains(numElem); + + CalcKinematicsForElems(domain, deltatime, numElem); + + int error = 0; + Kokkos::parallel_reduce(numElem, + KOKKOS_LAMBDA(const int k, int& err) { + Real_t vdov = + domain.dxx(k) + domain.dyy(k) + domain.dzz(k); + Real_t vdovthird = vdov / Real_t(3.0); + + domain.vdov(k) = vdov; + domain.dxx(k) -= vdovthird; + domain.dyy(k) -= vdovthird; + 
domain.dzz(k) -= vdovthird; + + if(domain.vnew(k) <= Real_t(0.0)) + { + err++; + } + }, + error); + + if(error) +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, VolumeError); +#else + exit(VolumeError); +#endif + + domain.DeallocateStrains(); + } +} + +static inline void +CalcMonotonicQGradientsForElems(Domain& domain) +{ + Index_t numElem = domain.numElem(); + + Kokkos::parallel_for( + "CalcMonotonicQGradientsForElems", numElem, KOKKOS_LAMBDA(const int i) { + const Real_t ptiny = Real_t(1.e-36); + Real_t ax, ay, az; + Real_t dxv, dyv, dzv; + + const Index_t* elemToNode = domain.nodelist(i); + Index_t n0 = elemToNode[0]; + Index_t n1 = elemToNode[1]; + Index_t n2 = elemToNode[2]; + Index_t n3 = elemToNode[3]; + Index_t n4 = elemToNode[4]; + Index_t n5 = elemToNode[5]; + Index_t n6 = elemToNode[6]; + Index_t n7 = elemToNode[7]; + + Real_t x0 = domain.x(n0); + Real_t x1 = domain.x(n1); + Real_t x2 = domain.x(n2); + Real_t x3 = domain.x(n3); + Real_t x4 = domain.x(n4); + Real_t x5 = domain.x(n5); + Real_t x6 = domain.x(n6); + Real_t x7 = domain.x(n7); + + Real_t y0 = domain.y(n0); + Real_t y1 = domain.y(n1); + Real_t y2 = domain.y(n2); + Real_t y3 = domain.y(n3); + Real_t y4 = domain.y(n4); + Real_t y5 = domain.y(n5); + Real_t y6 = domain.y(n6); + Real_t y7 = domain.y(n7); + + Real_t z0 = domain.z(n0); + Real_t z1 = domain.z(n1); + Real_t z2 = domain.z(n2); + Real_t z3 = domain.z(n3); + Real_t z4 = domain.z(n4); + Real_t z5 = domain.z(n5); + Real_t z6 = domain.z(n6); + Real_t z7 = domain.z(n7); + + Real_t xv0 = domain.xd(n0); + Real_t xv1 = domain.xd(n1); + Real_t xv2 = domain.xd(n2); + Real_t xv3 = domain.xd(n3); + Real_t xv4 = domain.xd(n4); + Real_t xv5 = domain.xd(n5); + Real_t xv6 = domain.xd(n6); + Real_t xv7 = domain.xd(n7); + + Real_t yv0 = domain.yd(n0); + Real_t yv1 = domain.yd(n1); + Real_t yv2 = domain.yd(n2); + Real_t yv3 = domain.yd(n3); + Real_t yv4 = domain.yd(n4); + Real_t yv5 = domain.yd(n5); + Real_t yv6 = domain.yd(n6); + Real_t yv7 = domain.yd(n7); + + 
Real_t zv0 = domain.zd(n0); + Real_t zv1 = domain.zd(n1); + Real_t zv2 = domain.zd(n2); + Real_t zv3 = domain.zd(n3); + Real_t zv4 = domain.zd(n4); + Real_t zv5 = domain.zd(n5); + Real_t zv6 = domain.zd(n6); + Real_t zv7 = domain.zd(n7); + + Real_t vol = domain.volo(i) * domain.vnew(i); + Real_t norm = Real_t(1.0) / (vol + ptiny); + + Real_t dxj = Real_t(-0.25) * ((x0 + x1 + x5 + x4) - (x3 + x2 + x6 + x7)); + Real_t dyj = Real_t(-0.25) * ((y0 + y1 + y5 + y4) - (y3 + y2 + y6 + y7)); + Real_t dzj = Real_t(-0.25) * ((z0 + z1 + z5 + z4) - (z3 + z2 + z6 + z7)); + + Real_t dxi = Real_t(0.25) * ((x1 + x2 + x6 + x5) - (x0 + x3 + x7 + x4)); + Real_t dyi = Real_t(0.25) * ((y1 + y2 + y6 + y5) - (y0 + y3 + y7 + y4)); + Real_t dzi = Real_t(0.25) * ((z1 + z2 + z6 + z5) - (z0 + z3 + z7 + z4)); + + Real_t dxk = Real_t(0.25) * ((x4 + x5 + x6 + x7) - (x0 + x1 + x2 + x3)); + Real_t dyk = Real_t(0.25) * ((y4 + y5 + y6 + y7) - (y0 + y1 + y2 + y3)); + Real_t dzk = Real_t(0.25) * ((z4 + z5 + z6 + z7) - (z0 + z1 + z2 + z3)); + + ax = dyi * dzj - dzi * dyj; + ay = dzi * dxj - dxi * dzj; + az = dxi * dyj - dyi * dxj; + + domain.delx_zeta(i) = vol / SQRT(ax * ax + ay * ay + az * az + ptiny); + + ax *= norm; + ay *= norm; + az *= norm; + + dxv = Real_t(0.25) * ((xv4 + xv5 + xv6 + xv7) - (xv0 + xv1 + xv2 + xv3)); + dyv = Real_t(0.25) * ((yv4 + yv5 + yv6 + yv7) - (yv0 + yv1 + yv2 + yv3)); + dzv = Real_t(0.25) * ((zv4 + zv5 + zv6 + zv7) - (zv0 + zv1 + zv2 + zv3)); + + domain.delv_zeta(i) = ax * dxv + ay * dyv + az * dzv; + + ax = dyj * dzk - dzj * dyk; + ay = dzj * dxk - dxj * dzk; + az = dxj * dyk - dyj * dxk; + + domain.delx_xi(i) = vol / SQRT(ax * ax + ay * ay + az * az + ptiny); + + ax *= norm; + ay *= norm; + az *= norm; + + dxv = Real_t(0.25) * ((xv1 + xv2 + xv6 + xv5) - (xv0 + xv3 + xv7 + xv4)); + dyv = Real_t(0.25) * ((yv1 + yv2 + yv6 + yv5) - (yv0 + yv3 + yv7 + yv4)); + dzv = Real_t(0.25) * ((zv1 + zv2 + zv6 + zv5) - (zv0 + zv3 + zv7 + zv4)); + + domain.delv_xi(i) = ax * dxv + ay * dyv 
+ az * dzv; + + ax = dyk * dzi - dzk * dyi; + ay = dzk * dxi - dxk * dzi; + az = dxk * dyi - dyk * dxi; + + domain.delx_eta(i) = vol / SQRT(ax * ax + ay * ay + az * az + ptiny); + + ax *= norm; + ay *= norm; + az *= norm; + + dxv = Real_t(-0.25) * ((xv0 + xv1 + xv5 + xv4) - (xv3 + xv2 + xv6 + xv7)); + dyv = Real_t(-0.25) * ((yv0 + yv1 + yv5 + yv4) - (yv3 + yv2 + yv6 + yv7)); + dzv = Real_t(-0.25) * ((zv0 + zv1 + zv5 + zv4) - (zv3 + zv2 + zv6 + zv7)); + + domain.delv_eta(i) = ax * dxv + ay * dyv + az * dzv; + }); +} + +static inline void +CalcMonotonicQRegionForElems(Domain& domain, Int_t r, Real_t ptiny) +{ + Real_t monoq_limiter_mult = domain.monoq_limiter_mult(); + Real_t monoq_max_slope = domain.monoq_max_slope(); + Real_t qlc_monoq = domain.qlc_monoq(); + Real_t qqc_monoq = domain.qqc_monoq(); + + Kokkos::parallel_for( + "CalcMonotonicQRegionForElems", domain.regElemSize(r), + KOKKOS_LAMBDA(const int i) { + Index_t ielem = domain.regElemlist(r, i); + Real_t qlin, qquad; + Real_t phixi, phieta, phizeta; + Int_t bcMask = domain.elemBC(ielem); + Real_t delvm = 0.0, delvp = 0.0; + + Real_t norm = Real_t(1.) 
/ (domain.delv_xi(ielem) + ptiny); + + switch(bcMask & XI_M) + { + case XI_M_COMM: + case 0: delvm = domain.delv_xi(domain.lxim(ielem)); break; + case XI_M_SYMM: delvm = domain.delv_xi(ielem); break; + case XI_M_FREE: delvm = Real_t(0.0); break; + default: + printf("Error in switch at %s line %d\n", __FILE__, __LINE__); + delvm = 0; + break; + } + switch(bcMask & XI_P) + { + case XI_P_COMM: + case 0: delvp = domain.delv_xi(domain.lxip(ielem)); break; + case XI_P_SYMM: delvp = domain.delv_xi(ielem); break; + case XI_P_FREE: delvp = Real_t(0.0); break; + default: + printf("Error in switch at %s line %d\n", __FILE__, __LINE__); + delvp = 0; + break; + } + + delvm = delvm * norm; + delvp = delvp * norm; + + phixi = Real_t(.5) * (delvm + delvp); + + delvm *= monoq_limiter_mult; + delvp *= monoq_limiter_mult; + + if(delvm < phixi) + phixi = delvm; + if(delvp < phixi) + phixi = delvp; + if(phixi < Real_t(0.)) + phixi = Real_t(0.); + if(phixi > monoq_max_slope) + phixi = monoq_max_slope; + + norm = Real_t(1.) 
/ (domain.delv_eta(ielem) + ptiny); + + switch(bcMask & ETA_M) + { + case ETA_M_COMM: + case 0: delvm = domain.delv_eta(domain.letam(ielem)); break; + case ETA_M_SYMM: delvm = domain.delv_eta(ielem); break; + case ETA_M_FREE: delvm = Real_t(0.0); break; + default: + printf("Error in switch at %s line %d\n", __FILE__, __LINE__); + delvm = 0; + break; + } + switch(bcMask & ETA_P) + { + case ETA_P_COMM: + case 0: delvp = domain.delv_eta(domain.letap(ielem)); break; + case ETA_P_SYMM: delvp = domain.delv_eta(ielem); break; + case ETA_P_FREE: delvp = Real_t(0.0); break; + default: + printf("Error in switch at %s line %d\n", __FILE__, __LINE__); + delvp = 0; + break; + } + + delvm = delvm * norm; + delvp = delvp * norm; + + phieta = Real_t(.5) * (delvm + delvp); + + delvm *= monoq_limiter_mult; + delvp *= monoq_limiter_mult; + + if(delvm < phieta) + phieta = delvm; + if(delvp < phieta) + phieta = delvp; + if(phieta < Real_t(0.)) + phieta = Real_t(0.); + if(phieta > monoq_max_slope) + phieta = monoq_max_slope; + + norm = Real_t(1.) 
/ (domain.delv_zeta(ielem) + ptiny); + + switch(bcMask & ZETA_M) + { + case ZETA_M_COMM: + case 0: delvm = domain.delv_zeta(domain.lzetam(ielem)); break; + case ZETA_M_SYMM: delvm = domain.delv_zeta(ielem); break; + case ZETA_M_FREE: delvm = Real_t(0.0); break; + default: + printf("Error in switch at %s line %d\n", __FILE__, __LINE__); + delvm = 0; + break; + } + switch(bcMask & ZETA_P) + { + case ZETA_P_COMM: + case 0: delvp = domain.delv_zeta(domain.lzetap(ielem)); break; + case ZETA_P_SYMM: delvp = domain.delv_zeta(ielem); break; + case ZETA_P_FREE: delvp = Real_t(0.0); break; + default: + printf("Error in switch at %s line %d\n", __FILE__, __LINE__); + delvp = 0; + break; + } + + delvm = delvm * norm; + delvp = delvp * norm; + + phizeta = Real_t(.5) * (delvm + delvp); + + delvm *= monoq_limiter_mult; + delvp *= monoq_limiter_mult; + + if(delvm < phizeta) + phizeta = delvm; + if(delvp < phizeta) + phizeta = delvp; + if(phizeta < Real_t(0.)) + phizeta = Real_t(0.); + if(phizeta > monoq_max_slope) + phizeta = monoq_max_slope; + + if(domain.vdov(ielem) > Real_t(0.)) + { + qlin = Real_t(0.); + qquad = Real_t(0.); + } + else + { + Real_t delvxxi = domain.delv_xi(ielem) * domain.delx_xi(ielem); + Real_t delvxeta = domain.delv_eta(ielem) * domain.delx_eta(ielem); + Real_t delvxzeta = domain.delv_zeta(ielem) * domain.delx_zeta(ielem); + + if(delvxxi > Real_t(0.)) + delvxxi = Real_t(0.); + if(delvxeta > Real_t(0.)) + delvxeta = Real_t(0.); + if(delvxzeta > Real_t(0.)) + delvxzeta = Real_t(0.); + + Real_t rho = + domain.elemMass(ielem) / (domain.volo(ielem) * domain.vnew(ielem)); + + qlin = + -qlc_monoq * rho * + (delvxxi * (Real_t(1.) - phixi) + delvxeta * (Real_t(1.) - phieta) + + delvxzeta * (Real_t(1.) - phizeta)); + + qquad = qqc_monoq * rho * + (delvxxi * delvxxi * (Real_t(1.) - phixi * phixi) + + delvxeta * delvxeta * (Real_t(1.) - phieta * phieta) + + delvxzeta * delvxzeta * (Real_t(1.) 
- phizeta * phizeta)); + } + + domain.qq(ielem) = qquad; + domain.ql(ielem) = qlin; + }); +} + +static inline void +CalcMonotonicQForElems(Domain& domain) +{ + const Real_t ptiny = Real_t(1.e-36); + + for(Index_t r = 0; r < domain.numReg(); ++r) + { + if(domain.regElemSize(r) > 0) + { + CalcMonotonicQRegionForElems(domain, r, ptiny); + } + } +} + +static inline void +CalcQForElems(Domain& domain) +{ + Index_t numElem = domain.numElem(); + + if(numElem != 0) + { + Int_t allElem = numElem + /* local elem */ + 2 * domain.sizeX() * domain.sizeY() + /* plane ghosts */ + 2 * domain.sizeX() * domain.sizeZ() + /* row ghosts */ + 2 * domain.sizeY() * domain.sizeZ(); /* col ghosts */ + + domain.AllocateGradients(numElem, allElem); + +#if USE_MPI + CommRecv(domain, MSG_MONOQ, 3, domain.sizeX(), domain.sizeY(), domain.sizeZ(), + true, true); +#endif + CalcMonotonicQGradientsForElems(domain); + +#if USE_MPI + Domain_member fieldData[3]; + + fieldData[0] = &Domain::delv_xi; + fieldData[1] = &Domain::delv_eta; + fieldData[2] = &Domain::delv_zeta; + + CommSend(domain, MSG_MONOQ, 3, fieldData, domain.sizeX(), domain.sizeY(), + domain.sizeZ(), true, true); + + CommMonoQ(domain); +#endif + + CalcMonotonicQForElems(domain); + + domain.DeallocateGradients(); + + Index_t idx = 0; + Kokkos::parallel_reduce(numElem, + KOKKOS_LAMBDA(const Index_t& i, Index_t& count) { + if(domain.q(i) > domain.qstop()) + { + count++; + } + }, + idx); + + if(idx > 0) + { +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, QStopError); +#else + exit(QStopError); +#endif + } + } +} + +KOKKOS_INLINE_FUNCTION +void +CalcPressureForElem(Real_t& p_new_i, Real_t& bvc_i, Real_t& pbvc_i, const Real_t& e_old_i, + const Real_t& compression_i, const Real_t& vnewc_e, + const Real_t& pmin, const Real_t& p_cut, const Real_t& eosvmax) +{ + const Real_t c1s = Real_t(2.0) / Real_t(3.0); + bvc_i = c1s * (compression_i + Real_t(1.)); + + pbvc_i = c1s; + + p_new_i = bvc_i * e_old_i; + + if(FABS(p_new_i) < p_cut) + p_new_i = Real_t(0.0); + 
+ if(vnewc_e >= eosvmax) /* impossible condition here? */ + p_new_i = Real_t(0.0); + + if(p_new_i < pmin) + p_new_i = pmin; +} + +static inline void +CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new, Real_t* bvc, Real_t* pbvc, + Real_t* p_old, Real_t* e_old, Real_t* q_old, Real_t* compression, + Real_t* compHalfStep, Real_t* vnewc, Real_t* work, Real_t* delvc, + Real_t pmin, Real_t p_cut, Real_t e_cut, Real_t q_cut, Real_t emin, + Real_t* qq_old, Real_t* ql_old, Real_t rho0, Real_t eosvmax, + Index_t length, Index_t* regElemList) +{ + Kokkos::parallel_for("CalcEnergyForElems", length, KOKKOS_LAMBDA(const int i) { + const Real_t delvc_i = delvc[i]; + const Real_t p_old_i = p_old[i]; + const Real_t q_old_i = q_old[i]; + Real_t e_new_i = e_old[i] - Real_t(0.5) * delvc_i * (p_old_i + q_old_i) + + Real_t(0.5) * work[i]; + + if(e_new_i < emin) + { + e_new_i = emin; + } + + Real_t bvc_i, pbvc_i; + Real_t pHalfStep_i; + const Real_t vnewc_e = vnewc[regElemList[i]]; + const Real_t compHalfStep_i = compHalfStep[i]; + CalcPressureForElem(pHalfStep_i, bvc_i, pbvc_i, e_new_i, compHalfStep_i, vnewc_e, + pmin, p_cut, eosvmax); + + Real_t vhalf = Real_t(1.) / (Real_t(1.) 
+ compHalfStep_i); + + Real_t q_new_i; + const Real_t ql_old_i = ql_old[i]; + const Real_t qq_old_i = qq_old[i]; + if(delvc_i > Real_t(0.)) + { + q_new_i /* = qq_old[i] = ql_old[i] */ = Real_t(0.); + } + else + { + Real_t ssc = (pbvc_i * e_new_i + vhalf * vhalf * bvc_i * pHalfStep_i) / rho0; + + if(ssc <= Real_t(.1111111e-36)) + { + ssc = Real_t(.3333333e-18); + } + else + { + ssc = SQRT(ssc); + } + + q_new_i = (ssc * ql_old_i + qq_old_i); + } + + e_new_i = e_new_i + Real_t(0.5) * delvc_i * + (Real_t(3.0) * (p_old_i + q_old_i) - + Real_t(4.0) * (pHalfStep_i + q_new_i)); + + e_new_i += Real_t(0.5) * work[i]; + + if(FABS(e_new_i) < e_cut) + { + e_new_i = Real_t(0.); + } + if(e_new_i < emin) + { + e_new_i = emin; + } + Real_t p_new_i; + const Real_t compression_i = compression[i]; + CalcPressureForElem(p_new_i, bvc_i, pbvc_i, e_new_i, compression_i, vnewc_e, pmin, + p_cut, eosvmax); + + const Real_t sixth = Real_t(1.0) / Real_t(6.0); + Real_t q_tilde; + + if(delvc_i > Real_t(0.)) + { + q_tilde = Real_t(0.); + } + else + { + Real_t ssc = (pbvc_i * e_new_i + vnewc_e * vnewc_e * bvc_i * p_new_i) / rho0; + + if(ssc <= Real_t(.1111111e-36)) + { + ssc = Real_t(.3333333e-18); + } + else + { + ssc = SQRT(ssc); + } + + q_tilde = (ssc * ql_old_i + qq_old_i); + } + + e_new_i = + e_new_i - (Real_t(7.0) * (p_old_i + q_old_i) - + Real_t(8.0) * (pHalfStep_i + q_new_i) + (p_new_i + q_tilde)) * + delvc_i * sixth; + + if(FABS(e_new_i) < e_cut) + { + e_new_i = Real_t(0.); + } + if(e_new_i < emin) + { + e_new_i = emin; + } + + CalcPressureForElem(p_new_i, bvc_i, pbvc_i, e_new_i, compression_i, vnewc_e, pmin, + p_cut, eosvmax); + bvc[i] = bvc_i; + pbvc[i] = pbvc_i; + p_new[i] = p_new_i; + + if(delvc_i <= Real_t(0.)) + { + Real_t ssc = (pbvc_i * e_new_i + vnewc_e * vnewc_e * bvc_i * p_new_i) / rho0; + + if(ssc <= Real_t(.1111111e-36)) + { + ssc = Real_t(.3333333e-18); + } + else + { + ssc = SQRT(ssc); + } + + q_new_i = (ssc * ql_old_i + qq_old_i); + + if(FABS(q_new_i) < q_cut) + q_new_i = 
Real_t(0.); + } + q_new[i] = q_new_i; + e_new[i] = e_new_i; + }); + + return; +} + +static inline void +CalcSoundSpeedForElems(Domain& domain, Real_t* vnewc, Real_t rho0, Real_t* enewc, + Real_t* pnewc, Real_t* pbvc, Real_t* bvc, Real_t ss4o3, + Index_t len, Index_t* regElemList) +{ + Kokkos::parallel_for("CalcSoundSpeedForElems", len, KOKKOS_LAMBDA(const int i) { + Index_t ielem = regElemList[i]; + Real_t ssTmp = + (pbvc[i] * enewc[i] + vnewc[ielem] * vnewc[ielem] * bvc[i] * pnewc[i]) / rho0; + if(ssTmp <= Real_t(.1111111e-36)) + { + ssTmp = Real_t(.3333333e-18); + } + else + { + ssTmp = SQRT(ssTmp); + } + domain.ss(ielem) = ssTmp; + }); +} + +static inline void +EvalEOSForElems(Domain& domain, Real_t* vnewc, Int_t numElemReg, Index_t* regElemList, + Int_t rep) +{ + Real_t e_cut = domain.e_cut(); + Real_t p_cut = domain.p_cut(); + Real_t ss4o3 = domain.ss4o3(); + Real_t q_cut = domain.q_cut(); + + Real_t eosvmax = domain.eosvmax(); + Real_t eosvmin = domain.eosvmin(); + Real_t pmin = domain.pmin(); + Real_t emin = domain.emin(); + Real_t rho0 = domain.refdens(); + + ResizeBuffer((numElemReg * sizeof(Real_t) + 4096) * 16); + + Real_t* e_old = AllocateFromBuffer(numElemReg); + Real_t* delvc = AllocateFromBuffer(numElemReg); + Real_t* p_old = AllocateFromBuffer(numElemReg); + Real_t* q_old = AllocateFromBuffer(numElemReg); + Real_t* compression = AllocateFromBuffer(numElemReg); + Real_t* compHalfStep = AllocateFromBuffer(numElemReg); + Real_t* qq_old = AllocateFromBuffer(numElemReg); + Real_t* ql_old = AllocateFromBuffer(numElemReg); + Real_t* work = AllocateFromBuffer(numElemReg); + Real_t* p_new = AllocateFromBuffer(numElemReg); + Real_t* e_new = AllocateFromBuffer(numElemReg); + Real_t* q_new = AllocateFromBuffer(numElemReg); + Real_t* bvc = AllocateFromBuffer(numElemReg); + Real_t* pbvc = AllocateFromBuffer(numElemReg); + + for(Int_t j = 0; j < rep; j++) + { + Kokkos::parallel_for("EvalEOSForElems A", numElemReg, KOKKOS_LAMBDA(const int i) { + Index_t ielem = 
regElemList[i]; + e_old[i] = domain.c_e(ielem); + delvc[i] = domain.c_delv(ielem); + p_old[i] = domain.c_p(ielem); + q_old[i] = domain.c_q(ielem); + qq_old[i] = domain.c_qq(ielem); + ql_old[i] = domain.c_ql(ielem); + const Real_t vnewc_ielem = vnewc[ielem]; + Real_t vchalf; + compression[i] = Real_t(1.) / vnewc_ielem - Real_t(1.); + vchalf = vnewc_ielem - delvc[i] * Real_t(.5); + compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.); + + if(eosvmin != Real_t(0.)) + { + if(vnewc_ielem <= eosvmin) + { /* impossible due to calling func? */ + compHalfStep[i] = compression[i]; + } + } + if(eosvmax != Real_t(0.)) + { + if(vnewc_ielem >= eosvmax) + { /* impossible due to calling func? */ + p_old[i] = Real_t(0.); + compression[i] = Real_t(0.); + compHalfStep[i] = Real_t(0.); + } + } + work[i] = Real_t(0.); + }); + + CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc, p_old, e_old, q_old, + compression, compHalfStep, vnewc, work, delvc, pmin, p_cut, + e_cut, q_cut, emin, qq_old, ql_old, rho0, eosvmax, numElemReg, + regElemList); + } + + Kokkos::parallel_for("EvalEOSForElems F", numElemReg, KOKKOS_LAMBDA(const int i) { + Index_t ielem = regElemList[i]; + domain.p(ielem) = p_new[i]; + domain.e(ielem) = e_new[i]; + domain.q(ielem) = q_new[i]; + }); + + CalcSoundSpeedForElems(domain, vnewc, rho0, e_new, p_new, pbvc, bvc, ss4o3, + numElemReg, regElemList); +} + +static inline void +ApplyMaterialPropertiesForElems(Domain& domain) +{ + Index_t numElem = domain.numElem(); + + if(numElem != 0) + { + Real_t eosvmin = domain.eosvmin(); + Real_t eosvmax = domain.eosvmax(); + Real_t* vnewc = Allocate(numElem); + + Kokkos::parallel_for("ApplyMaterialPropertiesForElems A", numElem, + KOKKOS_LAMBDA(const int i) { vnewc[i] = domain.vnew(i); }); + + if(eosvmin != Real_t(0.)) + { + Kokkos::parallel_for("ApplyMaterialPropertiesForElems B", numElem, + KOKKOS_LAMBDA(const int i) { + if(vnewc[i] < eosvmin) + vnewc[i] = eosvmin; + }); + } + + if(eosvmax != Real_t(0.)) + { + 
Kokkos::parallel_for("ApplyMaterialPropertiesForElems C", numElem, + KOKKOS_LAMBDA(const int i) { + if(vnewc[i] > eosvmax) + vnewc[i] = eosvmax; + }); + } + + int error = 0; + Kokkos::parallel_reduce(numElem, + KOKKOS_LAMBDA(const int i, int& err) { + Real_t vc = domain.v(i); + if(eosvmin != Real_t(0.)) + { + if(vc < eosvmin) + vc = eosvmin; + } + if(eosvmax != Real_t(0.)) + { + if(vc > eosvmax) + vc = eosvmax; + } + if(vc <= 0.) + { + err++; + } + }, + error); + + if(error) +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, VolumeError); +#else + exit(VolumeError); +#endif + + for(Int_t r = 0; r < domain.numReg(); r++) + { + Index_t numElemReg = domain.regElemSize(r); + Index_t* regElemList = domain.regElemlist(r); + Int_t rep; + if(r < domain.numReg() / 2) + rep = 1; + else if(r < (domain.numReg() - (domain.numReg() + 15) / 20)) + rep = 1 + domain.cost(); + else + rep = 10 * (1 + domain.cost()); + EvalEOSForElems(domain, vnewc, numElemReg, regElemList, rep); + } + + Release(&vnewc); + } +} + +static inline void +UpdateVolumesForElems(Domain& domain, Real_t v_cut, Index_t length) +{ + if(length != 0) + { + Kokkos::parallel_for("UpdateVolumesForElems", length, KOKKOS_LAMBDA(const int i) { + Real_t tmpV = domain.vnew(i); + + if(FABS(tmpV - Real_t(1.0)) < v_cut) + tmpV = Real_t(1.0); + + domain.v(i) = tmpV; + }); + } + + return; +} + +static inline void +LagrangeElements(Domain& domain, Index_t numElem) +{ + CalcLagrangeElements(domain); + + CalcQForElems(domain); + + ApplyMaterialPropertiesForElems(domain); + + UpdateVolumesForElems(domain, domain.v_cut(), numElem); +} + +static inline void +CalcCourantConstraintForElems(Domain& domain, Index_t length, Index_t* regElemlist, + Real_t qqc, Real_t& dtcourant) +{ + typedef Kokkos::View view_real_t; + + Real_t qqc2 = Real_t(64.0) * qqc * qqc; + Real_t dtcourant_tmp = dtcourant; + Index_t courant_elem = -1; + + MinFinder result; + + Kokkos::parallel_reduce(length, + KOKKOS_LAMBDA(const int i, MinFinder& minf) { + Index_t indx = 
regElemlist[i]; + Real_t dtf = domain.ss(indx) * domain.ss(indx); + + if(domain.vdov(indx) < Real_t(0.)) + { + dtf = dtf + qqc2 * domain.arealg(indx) * + domain.arealg(indx) * + domain.vdov(indx) * domain.vdov(indx); + } + + dtf = SQRT(dtf); + dtf = domain.arealg(indx) / dtf; + + MinFinder tmp(dtf, i); + if(domain.vdov(indx) != Real_t(0.)) + { + minf += tmp; + } + }, + result); + + dtcourant_tmp = result.val; + + if(dtcourant_tmp > dtcourant) + { + dtcourant_tmp = dtcourant; + } + + courant_elem = result.i; + + if(courant_elem != -1) + { + dtcourant = dtcourant_tmp; + } + + return; +} + +static inline void +CalcHydroConstraintForElems(Domain& domain, Index_t length, Index_t* regElemlist, + Real_t dvovmax, Real_t& dthydro) +{ + typedef Kokkos::View view_real_t; + + Real_t dthydro_tmp = dthydro; + Index_t hydro_elem = -1; + MinFinder result; + + Kokkos::parallel_reduce(length, + KOKKOS_LAMBDA(const int i, MinFinder& minf) { + Index_t indx = regElemlist[i]; + + if(domain.vdov(indx) != Real_t(0.)) + { + Real_t dtdvov = dvovmax / (FABS(domain.vdov(indx)) + + Real_t(1.e-20)); + + MinFinder tmp(dtdvov, i); + if(domain.vdov(indx) != Real_t(0.)) + { + minf += tmp; + } + } + }, + result); + + if(result.val > dthydro) + { + result.val = dthydro; + } + + if(result.i != -1) + { + dthydro = result.val; + } + + return; +} + +static inline void +CalcTimeConstraintsForElems(Domain& domain) +{ + domain.dtcourant() = 1.0e+20; + domain.dthydro() = 1.0e+20; + + for(Index_t r = 0; r < domain.numReg(); ++r) + { + CalcCourantConstraintForElems(domain, domain.regElemSize(r), + domain.regElemlist(r), domain.qqc(), + domain.dtcourant()); + + CalcHydroConstraintForElems(domain, domain.regElemSize(r), domain.regElemlist(r), + domain.dvovmax(), domain.dthydro()); + } +} + +static inline void +LagrangeLeapFrog(Domain& domain) +{ +#ifdef SEDOV_SYNC_POS_VEL_LATE + Domain_member fieldData[6]; +#endif + LagrangeNodal(domain); + +#ifdef SEDOV_SYNC_POS_VEL_LATE +#endif + LagrangeElements(domain, 
domain.numElem()); + +#if USE_MPI +# ifdef SEDOV_SYNC_POS_VEL_LATE + CommRecv(domain, MSG_SYNC_POS_VEL, 6, domain.sizeX() + 1, domain.sizeY() + 1, + domain.sizeZ() + 1, false, false); + + fieldData[0] = &Domain::x; + fieldData[1] = &Domain::y; + fieldData[2] = &Domain::z; + fieldData[3] = &Domain::xd; + fieldData[4] = &Domain::yd; + fieldData[5] = &Domain::zd; + + CommSend(domain, MSG_SYNC_POS_VEL, 6, fieldData, domain.sizeX() + 1, + domain.sizeY() + 1, domain.sizeZ() + 1, false, false); +# endif +#endif + + CalcTimeConstraintsForElems(domain); + +#if USE_MPI +# ifdef SEDOV_SYNC_POS_VEL_LATE + CommSyncPosVel(domain); +# endif +#endif +} + +int +main(int argc, char* argv[]) +{ + Int_t numRanks; + Int_t myRank; + struct cmdLineOpts opts; + +#if USE_MPI + Domain_member fieldData; + + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &numRanks); + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); +#else + numRanks = 1; + myRank = 0; +#endif + + Kokkos::initialize(); + { + opts.its = 9999999; /* defaults; ParseCommandLineOptions below may override them */ + opts.nx = 30; + opts.numReg = 11; + opts.numFiles = (int) (numRanks + 10) / 9; + opts.showProg = 0; + opts.quiet = 0; + opts.viz = 0; + opts.balance = 1; + opts.cost = 1; + opts.do_atomic = 0; + + ParseCommandLineOptions(argc, argv, myRank, &opts); + + if(opts.do_atomic == 1) + do_atomic = 1; + else + do_atomic = 0; + + if((myRank == 0) && (opts.quiet == 0)) + { + printf("Running problem size %d^3 per domain until completion\n", opts.nx); + printf("Num processors: %d\n", numRanks); +#if _OPENMP + printf("Num threads: %d\n", omp_get_max_threads()); +#endif + printf("Total number of elements: %lld\n\n", + (long long int) numRanks * opts.nx * opts.nx * opts.nx); /* widen first: the product of four ints can overflow int for large -s */ + printf("To run other sizes, use -s <integer>.\n"); + printf("To run a fixed number of iterations, use -i <integer>.\n"); + printf("To run a more or less balanced region set, use -b <integer>.\n"); + printf("To change the relative costs of regions, use -c <integer>.\n"); + printf("To print out progress, use -p\n"); + printf("To write an output file for VisIt, 
use -v\n"); + printf("See help (-h) for more options\n\n"); + } + + Int_t col, row, plane, side; + InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side); + + // Build the main data structure and initialize it + Domain locDom(numRanks, col, row, plane, opts.nx, side, opts.numReg, opts.balance, + opts.cost); + +#if USE_MPI + fieldData = &Domain::nodalMass; + + // Initial domain boundary communication + CommRecv(locDom, MSG_COMM_SBN, 1, locDom.sizeX() + 1, locDom.sizeY() + 1, + locDom.sizeZ() + 1, true, false); + CommSend(locDom, MSG_COMM_SBN, 1, &fieldData, locDom.sizeX() + 1, + locDom.sizeY() + 1, locDom.sizeZ() + 1, true, false); + CommSBN(locDom, 1, &fieldData); + + // End initialization + MPI_Barrier(MPI_COMM_WORLD); +#endif + +#if USE_MPI + double start = MPI_Wtime(); +#else + timeval start; + gettimeofday(&start, NULL); +#endif + Kokkos::Tools::pushRegion("Time-Loop"); + while((locDom.time() < locDom.stoptime()) && (locDom.cycle() < opts.its)) + { + Kokkos::Tools::pushRegion("TimeIncrement"); + TimeIncrement(locDom); + Kokkos::Tools::popRegion(); + + Kokkos::Tools::pushRegion("LagrangeLeapFrog"); + LagrangeLeapFrog(locDom); + Kokkos::Tools::popRegion(); + + if((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) + { + printf("cycle = %d, time = %e, dt=%e\n", locDom.cycle(), + double(locDom.time()), double(locDom.deltatime())); + } + } + Kokkos::Tools::popRegion(); + + double elapsed_time; +#if USE_MPI + elapsed_time = MPI_Wtime() - start; +#else + timeval end; + gettimeofday(&end, NULL); + elapsed_time = (double) (end.tv_sec - start.tv_sec) + + ((double) (end.tv_usec - start.tv_usec)) / 1000000; +#endif + double elapsed_timeG; +#if USE_MPI + MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE, MPI_MAX, 0, + MPI_COMM_WORLD); +#else + elapsed_timeG = elapsed_time; +#endif + + if(opts.viz) + { + DumpToVisit(locDom, opts.numFiles, myRank, numRanks); + } + + if((myRank == 0) && (opts.quiet == 0)) + { + VerifyAndWriteFinalOutput(elapsed_timeG, 
locDom, opts.nx, numRanks); + } + + Release(&buffer); + } + Kokkos::finalize(); +#if USE_MPI + MPI_Finalize(); +#endif + + return 0; +} diff --git a/projects/rocprofiler-systems/examples/lulesh/lulesh.h b/projects/rocprofiler-systems/examples/lulesh/lulesh.h new file mode 100644 index 0000000000..6c616b7700 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/lulesh.h @@ -0,0 +1,836 @@ + +#if !defined(USE_MPI) +# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line" +#endif + +// OpenMP will be compiled in if this flag is set to 1 AND the compiler being +// used supports it (i.e. the _OPENMP symbol is defined) +#define USE_OMP 1 + +#if USE_MPI +# include + +/* + define one of these three symbols: + + SEDOV_SYNC_POS_VEL_NONE + SEDOV_SYNC_POS_VEL_EARLY + SEDOV_SYNC_POS_VEL_LATE +*/ + +# define SEDOV_SYNC_POS_VEL_EARLY 1 +#endif + +#include +#include + +#include +#include + +//************************************************** +// Allow flexibility for arithmetic representations +//************************************************** + +#define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) + +// Precision specification +typedef float real4; +typedef double real8; +typedef long double real10; // 10 bytes on x86 + +typedef int Index_t; // array subscript and loop index +typedef real8 Real_t; // floating point representation +typedef int Int_t; // integer representation + +enum +{ + VolumeError = -1, + QStopError = -2 +}; + +inline real4 +SQRT(real4 arg) +{ + return sqrtf(arg); +} +inline real8 +SQRT(real8 arg) +{ + return sqrt(arg); +} +inline real10 +SQRT(real10 arg) +{ + return sqrtl(arg); +} + +inline real4 +CBRT(real4 arg) +{ + return cbrtf(arg); +} +inline real8 +CBRT(real8 arg) +{ + return cbrt(arg); +} +inline real10 +CBRT(real10 arg) +{ + return cbrtl(arg); +} + +inline real4 +FABS(real4 arg) +{ + return fabsf(arg); +} +inline real8 +FABS(real8 arg) +{ + return fabs(arg); +} +inline real10 +FABS(real10 arg) +{ + return fabsl(arg); +} + +// Stuff needed for boundary conditions +// 3 BC flags (SYMM, FREE, COMM) on each of 6 hexahedral faces (18 bits) +#define XI_M 0x00007 +#define XI_M_SYMM 0x00001 +#define XI_M_FREE 0x00002 +#define XI_M_COMM 0x00004 + +#define XI_P 0x00038 +#define XI_P_SYMM 0x00008 +#define XI_P_FREE 0x00010 +#define XI_P_COMM 0x00020 + +#define ETA_M 0x001c0 +#define ETA_M_SYMM 0x00040 +#define ETA_M_FREE 0x00080 +#define ETA_M_COMM 0x00100 + +#define ETA_P 0x00e00 +#define ETA_P_SYMM 0x00200 +#define ETA_P_FREE 0x00400 +#define ETA_P_COMM 0x00800 + +#define ZETA_M 0x07000 +#define ZETA_M_SYMM 0x01000 +#define ZETA_M_FREE 0x02000 +#define ZETA_M_COMM 0x04000 + +#define ZETA_P 0x38000 +#define ZETA_P_SYMM 0x08000 +#define ZETA_P_FREE 0x10000 +#define ZETA_P_COMM 0x20000 + +// MPI Message Tags +#define MSG_COMM_SBN 1024 +#define MSG_SYNC_POS_VEL 2048 +#define MSG_MONOQ 3072 + +#define MAX_FIELDS_PER_MPI_COMM 6 + +// Assume 128 byte coherence +// Assume Real_t is an "integral power of 2" bytes wide +#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t)) + +#define CACHE_ALIGN_REAL(n) \ + (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & 
~(CACHE_COHERENCE_PAD_REAL - 1)) + +////////////////////////////////////////////////////// +// Primary data structure +////////////////////////////////////////////////////// + +/* + * The implementation of the data abstraction used for lulesh + * resides entirely in the Domain class below. You can change + * grouping and interleaving of fields here to maximize data layout + * efficiency for your underlying architecture or compiler. + * + * For example, fields can be implemented as STL objects or + * raw array pointers. As another example, individual fields + * m_x, m_y, m_z could be bundled into + * + * struct { Real_t x, y, z ; } *m_coord ; + * + * allowing accessor functions such as + * + * "Real_t &x(Index_t idx) { return m_coord[idx].x ; }" + * "Real_t &y(Index_t idx) { return m_coord[idx].y ; }" + * "Real_t &z(Index_t idx) { return m_coord[idx].z ; }" + */ + +class Domain +{ +public: + // Constructor + Domain(Int_t numRanks, Index_t colLoc, Index_t rowLoc, Index_t planeLoc, Index_t nx, + Int_t tp, Int_t nr, Int_t balance, Int_t cost); + + // Destructor + ~Domain(); + + // + // ALLOCATION + // + + void AllocateNodePersistent(Int_t numNode) // Node-centered + { + m_x.resize(numNode); // coordinates + m_y.resize(numNode); + m_z.resize(numNode); + + m_xd.resize(numNode); // velocities + m_yd.resize(numNode); + m_zd.resize(numNode); + + m_xdd.resize(numNode); // accelerations + m_ydd.resize(numNode); + m_zdd.resize(numNode); + + m_fx.resize(numNode); // forces + m_fy.resize(numNode); + m_fz.resize(numNode); + + m_nodalMass.resize(numNode); // mass + + m_c_x = m_x.d_view; /* rebind the m_c_* views to each vector's d_view after the resizes above */ + m_c_y = m_y.d_view; + m_c_z = m_z.d_view; + m_c_xd = m_xd.d_view; + m_c_yd = m_yd.d_view; + m_c_zd = m_zd.d_view; + } + + void AllocateElemPersistent(Int_t numElem) // Elem-centered + { + m_nodelist.resize(8 * numElem); + + // elem connectivities through face + m_lxim.resize(numElem); + m_lxip.resize(numElem); + m_letam.resize(numElem); + m_letap.resize(numElem); + m_lzetam.resize(numElem); + 
m_lzetap.resize(numElem); + + m_elemBC.resize(numElem); + + m_e.resize(numElem); + m_p.resize(numElem); + + m_q.resize(numElem); + m_ql.resize(numElem); + m_qq.resize(numElem); + + m_v.resize(numElem); + + m_volo.resize(numElem); + m_delv.resize(numElem); + m_vdov.resize(numElem); + + m_arealg.resize(numElem); + + m_ss.resize(numElem); + + m_elemMass.resize(numElem); + + m_vnew.resize(numElem); + + m_c_e = m_e.d_view; + m_c_p = m_p.d_view; + m_c_q = m_q.d_view; + m_c_ql = m_ql.d_view; + m_c_qq = m_qq.d_view; + m_c_delv = m_delv.d_view; + } + + void AllocateGradients(Int_t numElem, Int_t allElem) + { + // Position gradients + m_delx_xi.resize(numElem); + m_delx_eta.resize(numElem); + m_delx_zeta.resize(numElem); + + // Velocity gradients + m_delv_xi.resize(allElem); + m_delv_eta.resize(allElem); + m_delv_zeta.resize(allElem); + } + + void DeallocateGradients() + { + m_delx_zeta.clear(); + m_delx_eta.clear(); + m_delx_xi.clear(); + + m_delv_zeta.clear(); + m_delv_eta.clear(); + m_delv_xi.clear(); + } + + void AllocateStrains(Int_t numElem) + { + m_dxx.resize(numElem); + m_dyy.resize(numElem); + m_dzz.resize(numElem); + } + + void DeallocateStrains() + { + m_dzz.clear(); + m_dyy.clear(); + m_dxx.clear(); + } + + // + // ACCESSORS + // + + // Node-centered + + // Nodal coordinates + KOKKOS_INLINE_FUNCTION Real_t& x(const Index_t idx) const { return m_x[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& y(const Index_t idx) const { return m_y[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& z(const Index_t idx) const { return m_z[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_x(const Index_t idx) const { return m_c_x[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_y(const Index_t idx) const { return m_c_y[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_z(const Index_t idx) const { return m_c_z[idx]; } + + // Nodal velocities + KOKKOS_INLINE_FUNCTION Real_t& xd(const Index_t idx) const { return m_xd[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& yd(const Index_t idx) const { return m_yd[idx]; } + 
KOKKOS_INLINE_FUNCTION Real_t& zd(const Index_t idx) const { return m_zd[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_xd(const Index_t idx) const { return m_c_xd[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_yd(const Index_t idx) const { return m_c_yd[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_zd(const Index_t idx) const { return m_c_zd[idx]; } + + // Nodal accelerations + KOKKOS_INLINE_FUNCTION Real_t& xdd(const Index_t idx) const { return m_xdd[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& ydd(const Index_t idx) const { return m_ydd[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& zdd(const Index_t idx) const { return m_zdd[idx]; } + + // Nodal forces + KOKKOS_INLINE_FUNCTION Real_t& fx(const Index_t idx) const { return m_fx[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& fy(const Index_t idx) const { return m_fy[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& fz(const Index_t idx) const { return m_fz[idx]; } + + // Nodal mass + KOKKOS_INLINE_FUNCTION Real_t& nodalMass(const Index_t idx) const + { + return m_nodalMass[idx]; + } + + // Nodes on symmetry planes + Index_t symmX(const Index_t idx) const { return m_symmX[idx]; } + Index_t symmY(const Index_t idx) const { return m_symmY[idx]; } + Index_t symmZ(const Index_t idx) const { return m_symmZ[idx]; } + bool symmXempty() { return m_symmX.empty(); } + bool symmYempty() { return m_symmY.empty(); } + bool symmZempty() { return m_symmZ.empty(); } + + // + // Element-centered + // + Index_t& regElemSize(Index_t idx) { return m_regElemSize[idx]; } + Index_t& regNumList(Index_t idx) { return m_regNumList[idx]; } + Index_t* regNumList() { return &m_regNumList[0]; } + Index_t* regElemlist(Int_t r) { return m_regElemlist[r]; } + Index_t& regElemlist(const Int_t r, Index_t idx) const + { + return m_regElemlist[r][idx]; + } + + Index_t* nodelist(Index_t idx) const { return &m_nodelist[Index_t(8) * idx]; } + + // elem connectivities through face + Index_t& lxim(const Index_t idx) const { return m_lxim[idx]; } + Index_t& lxip(const Index_t idx) const { 
return m_lxip[idx]; } + Index_t& letam(const Index_t idx) const { return m_letam[idx]; } + Index_t& letap(const Index_t idx) const { return m_letap[idx]; } + Index_t& lzetam(const Index_t idx) const { return m_lzetam[idx]; } + Index_t& lzetap(const Index_t idx) const { return m_lzetap[idx]; } + + // elem face symm/free-surface flag + Int_t& elemBC(const Index_t idx) const { return m_elemBC[idx]; } + + // Principal strains - temporary + KOKKOS_INLINE_FUNCTION Real_t& dxx(const Index_t idx) const { return m_dxx[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& dyy(const Index_t idx) const { return m_dyy[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& dzz(const Index_t idx) const { return m_dzz[idx]; } + + // New relative volume - temporary + KOKKOS_INLINE_FUNCTION Real_t& vnew(const Index_t idx) const { return m_vnew[idx]; } + + // Velocity gradient - temporary + KOKKOS_INLINE_FUNCTION Real_t& delv_xi(const Index_t idx) const + { + return m_delv_xi[idx]; + } + KOKKOS_INLINE_FUNCTION Real_t& delv_eta(const Index_t idx) const + { + return m_delv_eta[idx]; + } + KOKKOS_INLINE_FUNCTION Real_t& delv_zeta(const Index_t idx) const + { + return m_delv_zeta[idx]; + } + + // Position gradient - temporary + KOKKOS_INLINE_FUNCTION Real_t& delx_xi(const Index_t idx) const + { + return m_delx_xi[idx]; + } + KOKKOS_INLINE_FUNCTION Real_t& delx_eta(const Index_t idx) const + { + return m_delx_eta[idx]; + } + KOKKOS_INLINE_FUNCTION Real_t& delx_zeta(const Index_t idx) const + { + return m_delx_zeta[idx]; + } + // Energy + KOKKOS_INLINE_FUNCTION Real_t& e(const Index_t idx) const { return m_e[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_e(const Index_t idx) const { return m_c_e[idx]; } + + // Pressure + KOKKOS_INLINE_FUNCTION Real_t& p(const Index_t idx) const { return m_p[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_p(const Index_t idx) const { return m_c_p[idx]; } + + // Artificial viscosity + KOKKOS_INLINE_FUNCTION Real_t& q(const Index_t idx) const { return m_q[idx]; } + KOKKOS_INLINE_FUNCTION Real_t 
c_q(const Index_t idx) const { return m_c_q[idx]; } + + // Linear term for q + KOKKOS_INLINE_FUNCTION Real_t& ql(const Index_t idx) const { return m_ql[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_ql(const Index_t idx) const { return m_c_ql[idx]; } + // Quadratic term for q + KOKKOS_INLINE_FUNCTION Real_t& qq(const Index_t idx) const { return m_qq[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_qq(const Index_t idx) const { return m_c_qq[idx]; } + + // Relative volume + KOKKOS_INLINE_FUNCTION Real_t& v(const Index_t idx) const { return m_v[idx]; } + KOKKOS_INLINE_FUNCTION Real_t& delv(const Index_t idx) const { return m_delv[idx]; } + KOKKOS_INLINE_FUNCTION Real_t c_delv(const Index_t idx) const + { + return m_c_delv[idx]; + } + + // Reference volume + KOKKOS_INLINE_FUNCTION Real_t& volo(Index_t idx) const { return m_volo[idx]; } + + // volume derivative over volume + KOKKOS_INLINE_FUNCTION Real_t& vdov(Index_t idx) const { return m_vdov[idx]; } + + // Element characteristic length + KOKKOS_INLINE_FUNCTION Real_t& arealg(Index_t idx) const { return m_arealg[idx]; } + + // Sound speed + KOKKOS_INLINE_FUNCTION Real_t& ss(const Index_t idx) const { return m_ss[idx]; } + + // Element mass + KOKKOS_INLINE_FUNCTION Real_t& elemMass(const Index_t idx) const + { + return m_elemMass[idx]; + } + + KOKKOS_INLINE_FUNCTION Index_t nodeElemCount(Index_t idx) const + { + return m_nodeElemStart[idx + 1] - m_nodeElemStart[idx]; + } + + KOKKOS_INLINE_FUNCTION Index_t* nodeElemCornerList(Index_t idx) const + { + return &m_nodeElemCornerList[m_nodeElemStart[idx]]; + } + + // Parameters + + // Cutoffs + KOKKOS_INLINE_FUNCTION Real_t u_cut() const { return m_u_cut; } + KOKKOS_INLINE_FUNCTION Real_t e_cut() const { return m_e_cut; } + KOKKOS_INLINE_FUNCTION Real_t p_cut() const { return m_p_cut; } + KOKKOS_INLINE_FUNCTION Real_t q_cut() const { return m_q_cut; } + KOKKOS_INLINE_FUNCTION Real_t v_cut() const { return m_v_cut; } + + // Other constants (usually are settable via input file in real 
codes) + KOKKOS_INLINE_FUNCTION Real_t hgcoef() const { return m_hgcoef; } + KOKKOS_INLINE_FUNCTION Real_t qstop() const { return m_qstop; } + KOKKOS_INLINE_FUNCTION Real_t monoq_max_slope() const { return m_monoq_max_slope; } + KOKKOS_INLINE_FUNCTION Real_t monoq_limiter_mult() const + { + return m_monoq_limiter_mult; + } + KOKKOS_INLINE_FUNCTION Real_t ss4o3() const { return m_ss4o3; } + KOKKOS_INLINE_FUNCTION Real_t qlc_monoq() const { return m_qlc_monoq; } + KOKKOS_INLINE_FUNCTION Real_t qqc_monoq() const { return m_qqc_monoq; } + KOKKOS_INLINE_FUNCTION Real_t qqc() const { return m_qqc; } + + KOKKOS_INLINE_FUNCTION Real_t eosvmax() const { return m_eosvmax; } + KOKKOS_INLINE_FUNCTION Real_t eosvmin() const { return m_eosvmin; } + KOKKOS_INLINE_FUNCTION Real_t pmin() const { return m_pmin; } + KOKKOS_INLINE_FUNCTION Real_t emin() const { return m_emin; } + KOKKOS_INLINE_FUNCTION Real_t dvovmax() const { return m_dvovmax; } + KOKKOS_INLINE_FUNCTION Real_t refdens() const { return m_refdens; } + + // Timestep controls, etc... 
+ Real_t& time() { return m_time; } + Real_t& deltatime() { return m_deltatime; } + Real_t& deltatimemultlb() { return m_deltatimemultlb; } + Real_t& deltatimemultub() { return m_deltatimemultub; } + Real_t& stoptime() { return m_stoptime; } + Real_t& dtcourant() { return m_dtcourant; } + Real_t& dthydro() { return m_dthydro; } + Real_t& dtmax() { return m_dtmax; } + Real_t& dtfixed() { return m_dtfixed; } + + Int_t& cycle() { return m_cycle; } + Index_t& numRanks() { return m_numRanks; } + + Index_t& colLoc() { return m_colLoc; } + Index_t& rowLoc() { return m_rowLoc; } + Index_t& planeLoc() { return m_planeLoc; } + Index_t& tp() { return m_tp; } + + Index_t& sizeX() { return m_sizeX; } + Index_t& sizeY() { return m_sizeY; } + Index_t& sizeZ() { return m_sizeZ; } + Index_t& numReg() { return m_numReg; } + Int_t& cost() { return m_cost; } + Index_t& numElem() { return m_numElem; } + Index_t& numNode() { return m_numNode; } + + Index_t& maxPlaneSize() { return m_maxPlaneSize; } + Index_t& maxEdgeSize() { return m_maxEdgeSize; } + + // + // MPI-Related additional data + // + +#if USE_MPI + // Communication Work space + Real_t* commDataSend; + Real_t* commDataRecv; + + // Maximum number of block neighbors + MPI_Request recvRequest[26]; // 6 faces + 12 edges + 8 corners + MPI_Request sendRequest[26]; // 6 faces + 12 edges + 8 corners +#endif + +private: + void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems); + void SetupThreadSupportStructures(); + void CreateRegionIndexSets(Int_t nreg, Int_t balance); + void SetupCommBuffers(Int_t edgeNodes); + void SetupSymmetryPlanes(Int_t edgeNodes); + void SetupElementConnectivities(Int_t edgeElems); + void SetupBoundaryConditions(Int_t edgeElems); + + // + // IMPLEMENTATION + // + + /* Node-centered */ + Kokkos::vector m_x; /* coordinates */ + Kokkos::vector m_y; + Kokkos::vector m_z; + Kokkos::View> + m_c_x; /* coordinates */ + Kokkos::View> + m_c_y; /* coordinates */ + Kokkos::View> + m_c_z; /* coordinates */ + + 
Kokkos::vector m_xd; /* velocities */ + Kokkos::vector m_yd; + Kokkos::vector m_zd; + Kokkos::View> + m_c_xd; /* view of m_xd.d_view (velocities) */ + Kokkos::View> + m_c_yd; /* view of m_yd.d_view (velocities) */ + Kokkos::View> + m_c_zd; /* view of m_zd.d_view (velocities) */ + + Kokkos::vector m_xdd; /* accelerations */ + Kokkos::vector m_ydd; + Kokkos::vector m_zdd; + + Kokkos::vector m_fx; /* forces */ + Kokkos::vector m_fy; + Kokkos::vector m_fz; + + Kokkos::vector m_nodalMass; /* mass */ + + Kokkos::vector m_symmX; /* symmetry plane nodesets */ + Kokkos::vector m_symmY; + Kokkos::vector m_symmZ; + + // Element-centered + + // Region information + Int_t m_numReg; + Int_t m_cost; // imbalance cost + Index_t* m_regElemSize; // Size of region sets + Index_t* m_regNumList; // Region number per domain element + Index_t** m_regElemlist; // region indexset + + Kokkos::vector m_nodelist; /* elemToNode connectivity */ + + Kokkos::vector m_lxim; /* element connectivity across each face */ + Kokkos::vector m_lxip; + Kokkos::vector m_letam; + Kokkos::vector m_letap; + Kokkos::vector m_lzetam; + Kokkos::vector m_lzetap; + + Kokkos::vector m_elemBC; /* symmetry/free-surface flags for each elem face */ + + Kokkos::vector m_dxx; /* principal strains -- temporary */ + Kokkos::vector m_dyy; + Kokkos::vector m_dzz; + + Kokkos::vector m_delv_xi; /* velocity gradient -- temporary */ + Kokkos::vector m_delv_eta; + Kokkos::vector m_delv_zeta; + + Kokkos::vector m_delx_xi; /* coordinate gradient -- temporary */ + Kokkos::vector m_delx_eta; + Kokkos::vector m_delx_zeta; + + Kokkos::vector m_e; /* energy */ + + Kokkos::vector m_p; /* pressure */ + Kokkos::vector m_q; /* q */ + Kokkos::vector m_ql; /* linear term for q */ + Kokkos::vector m_qq; /* quadratic term for q */ + + Kokkos::vector m_v; /* relative volume */ + Kokkos::vector m_volo; /* reference volume */ + Kokkos::vector m_vnew; /* new relative volume -- temporary */ + Kokkos::vector m_delv; /* m_vnew - m_v */ + Kokkos::vector m_vdov; /* volume derivative over volume */ + + Kokkos::View> 
+ m_c_e; /* view of m_e.d_view (energy) */ + Kokkos::View> + m_c_p; /* view of m_p.d_view (pressure) */ + Kokkos::View> + m_c_q; /* view of m_q.d_view (q) */ + Kokkos::View> + m_c_ql; /* view of m_ql.d_view (linear term for q) */ + Kokkos::View> + m_c_qq; /* view of m_qq.d_view (quadratic term for q) */ + Kokkos::View> + m_c_delv; /* view of m_delv.d_view (m_vnew - m_v) */ + + Kokkos::vector m_arealg; /* characteristic length of an element */ + + Kokkos::vector m_ss; /* "sound speed" */ + + Kokkos::vector m_elemMass; /* mass */ + + // Cutoffs (treat as constants) + const Real_t m_e_cut; // energy tolerance + const Real_t m_p_cut; // pressure tolerance + const Real_t m_q_cut; // q tolerance + const Real_t m_v_cut; // relative volume tolerance + const Real_t m_u_cut; // velocity tolerance + + // Other constants (usually settable, but hardcoded in this proxy app) + + const Real_t m_hgcoef; // hourglass control + const Real_t m_ss4o3; + const Real_t m_qstop; // excessive q indicator + const Real_t m_monoq_max_slope; + const Real_t m_monoq_limiter_mult; + const Real_t m_qlc_monoq; // linear term coef for q + const Real_t m_qqc_monoq; // quadratic term coef for q + const Real_t m_qqc; + const Real_t m_eosvmax; + const Real_t m_eosvmin; + const Real_t m_pmin; // pressure floor + const Real_t m_emin; // energy floor + const Real_t m_dvovmax; // maximum allowable volume change + const Real_t m_refdens; // reference density + + // Variables to keep track of timestep, simulation time, and cycle + Real_t m_dtcourant; // courant constraint + Real_t m_dthydro; // volume change constraint + Int_t m_cycle; // iteration count for simulation + Real_t m_dtfixed; // fixed time increment + Real_t m_time; // current time + Real_t m_deltatime; // variable time increment + Real_t m_deltatimemultlb; + Real_t m_deltatimemultub; + Real_t m_dtmax; // maximum allowable time increment + Real_t m_stoptime; // end time for simulation + + Int_t m_numRanks; + + Index_t m_colLoc; + Index_t m_rowLoc; + Index_t m_planeLoc; + Index_t m_tp; + + Index_t m_sizeX; + Index_t m_sizeY; + Index_t m_sizeZ; + Index_t m_numElem; + Index_t 
m_numNode; + + Index_t m_maxPlaneSize; + Index_t m_maxEdgeSize; + + // OMP hack + Index_t* m_nodeElemStart; + Index_t* m_nodeElemCornerList; + + // Used in setup + Index_t m_rowMin, m_rowMax; + Index_t m_colMin, m_colMax; + Index_t m_planeMin, m_planeMax; +}; +typedef Real_t& (Domain::*Domain_member)(Index_t) const; + +struct cmdLineOpts +{ + Int_t its; // -i + Int_t nx; // -s + Int_t numReg; // -r + Int_t numFiles; // -f + Int_t showProg; // -p + Int_t quiet; // -q + Int_t viz; // -v + Int_t cost; // -c + Int_t balance; // -b + Int_t do_atomic; // -a +}; + +// Function Prototypes + +// lulesh-par +/*Real_t CalcElemVolume( const Real_t x[8], + const Real_t y[8], + const Real_t z[8]);*/ + +// lulesh-util +void +ParseCommandLineOptions(int argc, char* argv[], Int_t myRank, struct cmdLineOpts* opts); +void +VerifyAndWriteFinalOutput(Real_t elapsed_time, Domain& locDom, Int_t nx, Int_t numRanks); + +// lulesh-viz +void +DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks); + +// lulesh-comm +void +CommRecv(Domain& domain, Int_t msgType, Index_t xferFields, Index_t dx, Index_t dy, + Index_t dz, bool doRecv, bool planeOnly); +void +CommSend(Domain& domain, Int_t msgType, Index_t xferFields, Domain_member* fieldData, + Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly); +void +CommSBN(Domain& domain, Int_t xferFields, Domain_member* fieldData); +void +CommSyncPosVel(Domain& domain); +void +CommMonoQ(Domain& domain); + +// lulesh-init +void +InitMeshDecomp(Int_t numRanks, Int_t myRank, Int_t* col, Int_t* row, Int_t* plane, + Int_t* side); + +/*********************************/ +/* Data structure implementation */ +/*********************************/ + +/* might want to add access methods so that memory can be */ +/* better managed, as in luleshFT */ + +template <typename T> /* returns raw storage for `size` objects of T from Kokkos::kokkos_malloc; no constructors are run here */ +T* +Allocate(size_t size) +{ + return static_cast<T*>(Kokkos::kokkos_malloc<>(sizeof(T) * size)); +} + +template <typename T> /* frees *ptr with Kokkos::kokkos_free and resets it to NULL; does nothing when *ptr is already NULL */ +void +Release(T** ptr) +{ + if(*ptr != NULL) + { + 
Kokkos::kokkos_free<>(*ptr); + *ptr = NULL; + } +} + +struct MinFinder +{ + Real_t val; + int i; + KOKKOS_INLINE_FUNCTION + + MinFinder() + : val(100000000000000000000.0000) + , i(-1) + {} + + KOKKOS_INLINE_FUNCTION + MinFinder(const double& val_, const int& i_) + : val(val_) + , i(i_) + {} + + KOKKOS_INLINE_FUNCTION + MinFinder(const MinFinder& src) + : val(src.val) + , i(src.i) + {} + + // overloading += operator to do the min assignment + KOKKOS_INLINE_FUNCTION + void operator+=(MinFinder& src) + { + if(src.val < val) + { + val = src.val; + i = src.i; + } + } + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile MinFinder& src) volatile + { + if(src.val < val) + { + val = src.val; + i = src.i; + } + } +}; + +struct reduce_double3 +{ + double x, y, z; + KOKKOS_INLINE_FUNCTION + reduce_double3() + { + x = 0.0; + y = 0.0; + z = 0.0; + } + KOKKOS_INLINE_FUNCTION + void operator+=(const reduce_double3& src) + { + x += src.x; + y += src.y; + z += src.z; + } +}; diff --git a/projects/rocprofiler-systems/examples/lulesh/lulesh_tuple.h b/projects/rocprofiler-systems/examples/lulesh/lulesh_tuple.h new file mode 100644 index 0000000000..e7273af967 --- /dev/null +++ b/projects/rocprofiler-systems/examples/lulesh/lulesh_tuple.h @@ -0,0 +1,651 @@ +#if !defined(USE_MPI) +# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line" +#endif + +// OpenMP will be compiled in if this flag is set to 1 AND the compiler being +// used supports it (i.e. the _OPENMP symbol is defined) +#define USE_OMP 1 + +#if USE_MPI +# include +#endif + +#include + +/* + define one of these three symbols: + + SEDOV_SYNC_POS_VEL_NONE + SEDOV_SYNC_POS_VEL_EARLY + SEDOV_SYNC_POS_VEL_LATE +*/ + +#define SEDOV_SYNC_POS_VEL_EARLY 1 + +#include +#include + +//************************************************** +// Allow flexibility for arithmetic representations +//************************************************** + +#define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) + +// Precision specification +typedef float real4; +typedef double real8; +typedef long double real10; // 10 bytes on x86 + +typedef int Index_t; // array subscript and loop index +typedef real8 Real_t; // floating point representation +typedef int Int_t; // integer representation + +enum +{ + VolumeError = -1, + QStopError = -2 +}; + +inline real4 +SQRT(real4 arg) +{ + return sqrtf(arg); +} +inline real8 +SQRT(real8 arg) +{ + return sqrt(arg); +} +inline real10 +SQRT(real10 arg) +{ + return sqrtl(arg); +} + +inline real4 +CBRT(real4 arg) +{ + return cbrtf(arg); +} +inline real8 +CBRT(real8 arg) +{ + return cbrt(arg); +} +inline real10 +CBRT(real10 arg) +{ + return cbrtl(arg); +} + +inline real4 +FABS(real4 arg) +{ + return fabsf(arg); +} +inline real8 +FABS(real8 arg) +{ + return fabs(arg); +} +inline real10 +FABS(real10 arg) +{ + return fabsl(arg); +} + +// Stuff needed for boundary conditions +// 2 BCs on each of 6 hexahedral faces (12 bits) +#define XI_M 0x00007 +#define XI_M_SYMM 0x00001 +#define XI_M_FREE 0x00002 +#define XI_M_COMM 0x00004 + +#define XI_P 0x00038 +#define XI_P_SYMM 0x00008 +#define XI_P_FREE 0x00010 +#define XI_P_COMM 0x00020 + +#define ETA_M 0x001c0 +#define ETA_M_SYMM 0x00040 +#define ETA_M_FREE 0x00080 +#define ETA_M_COMM 0x00100 + +#define ETA_P 0x00e00 +#define ETA_P_SYMM 0x00200 +#define ETA_P_FREE 0x00400 +#define ETA_P_COMM 0x00800 + +#define ZETA_M 0x07000 +#define ZETA_M_SYMM 0x01000 +#define ZETA_M_FREE 0x02000 +#define ZETA_M_COMM 0x04000 + +#define ZETA_P 0x38000 +#define ZETA_P_SYMM 0x08000 +#define ZETA_P_FREE 0x10000 +#define ZETA_P_COMM 0x20000 + +// MPI Message Tags +#define MSG_COMM_SBN 1024 +#define MSG_SYNC_POS_VEL 2048 +#define MSG_MONOQ 3072 + +#define MAX_FIELDS_PER_MPI_COMM 6 + +// Assume 128 byte coherence +// Assume Real_t is an "integral power of 2" bytes wide +#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t)) + +#define CACHE_ALIGN_REAL(n) \ + (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & 
~(CACHE_COHERENCE_PAD_REAL - 1)) + +////////////////////////////////////////////////////// +// Primary data structure +////////////////////////////////////////////////////// + +/* + * The implementation of the data abstraction used for lulesh + * resides entirely in the Domain class below. You can change + * grouping and interleaving of fields here to maximize data layout + * efficiency for your underlying architecture or compiler. + * + * For example, fields can be implemented as STL objects or + * raw array pointers. As another example, individual fields + * m_x, m_y, m_z could be budled into + * + * struct { Real_t x, y, z ; } *m_coord ; + * + * allowing accessor functions such as + * + * "Real_t &x(Index_t idx) { return m_coord[idx].x ; }" + * "Real_t &y(Index_t idx) { return m_coord[idx].y ; }" + * "Real_t &z(Index_t idx) { return m_coord[idx].z ; }" + */ + +class Domain +{ +public: + // Constructor + Domain(Int_t numRanks, Index_t colLoc, Index_t rowLoc, Index_t planeLoc, Index_t nx, + Int_t tp, Int_t nr, Int_t balance, Int_t cost); + + // + // ALLOCATION + // + + void AllocateNodePersistent(Int_t numNode) // Node-centered + { + m_coord.resize(numNode); // coordinates + + m_vel.resize(numNode); // velocities + + m_acc.resize(numNode); // accelerations + + m_force.resize(numNode); // forces + + m_nodalMass.resize(numNode); // mass + } + + void AllocateElemPersistent(Int_t numElem) // Elem-centered + { + m_nodelist.resize(8 * numElem); + + // elem connectivities through face + m_faceToElem.resize(numElem); + + m_elemBC.resize(numElem); + + m_e.resize(numElem); + + m_pq.resize(numElem); + + m_qlqq.resize(numElem); + + m_vol.resize(numElem); + + m_delv.resize(numElem); + m_vdov.resize(numElem); + + m_arealg.resize(numElem); + + m_ss.resize(numElem); + + m_elemMass.resize(numElem); + } + + void AllocateGradients(Int_t numElem, Int_t allElem) + { + // Position gradients + m_delx_xi.resize(numElem); + m_delx_eta.resize(numElem); + m_delx_zeta.resize(numElem); + + // 
Velocity gradients + m_delv_xi.resize(allElem); + m_delv_eta.resize(allElem); + m_delv_zeta.resize(allElem); + } + + void DeallocateGradients() + { + m_delx_zeta.clear(); + m_delx_eta.clear(); + m_delx_xi.clear(); + + m_delv_zeta.clear(); + m_delv_eta.clear(); + m_delv_xi.clear(); + } + + void AllocateStrains(Int_t numElem) + { + m_dxx.resize(numElem); + m_dyy.resize(numElem); + m_dzz.resize(numElem); + } + + void DeallocateStrains() + { + m_dzz.clear(); + m_dyy.clear(); + m_dxx.clear(); + } + + // + // ACCESSORS + // + + // Node-centered + + // Nodal coordinates + Real_t& x(Index_t idx) { return m_coord[idx].x; } + Real_t& y(Index_t idx) { return m_coord[idx].y; } + Real_t& z(Index_t idx) { return m_coord[idx].z; } + + // Nodal velocities + Real_t& xd(Index_t idx) { return m_vel[idx].x; } + Real_t& yd(Index_t idx) { return m_vel[idx].y; } + Real_t& zd(Index_t idx) { return m_vel[idx].z; } + + // Nodal accelerations + Real_t& xdd(Index_t idx) { return m_acc[idx].x; } + Real_t& ydd(Index_t idx) { return m_acc[idx].y; } + Real_t& zdd(Index_t idx) { return m_acc[idx].z; } + + // Nodal forces + Real_t& fx(Index_t idx) { return m_force[idx].x; } + Real_t& fy(Index_t idx) { return m_force[idx].y; } + Real_t& fz(Index_t idx) { return m_force[idx].z; } + + // Nodal mass + Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx]; } + + // Nodes on symmertry planes + Index_t symmX(Index_t idx) { return m_symmX[idx]; } + Index_t symmY(Index_t idx) { return m_symmY[idx]; } + Index_t symmZ(Index_t idx) { return m_symmZ[idx]; } + bool symmXempty() { return m_symmX.empty(); } + bool symmYempty() { return m_symmY.empty(); } + bool symmZempty() { return m_symmZ.empty(); } + + // + // Element-centered + // + Index_t& regElemSize(Index_t idx) { return m_regElemSize[idx]; } + Index_t& regNumList(Index_t idx) { return m_regNumList[idx]; } + Index_t* regNumList() { return &m_regNumList[0]; } + Index_t* regElemlist(Int_t r) { return m_regElemlist[r]; } + Index_t& regElemlist(Int_t r, 
Index_t idx) { return m_regElemlist[r][idx]; } + + Index_t* nodelist(Index_t idx) { return &m_nodelist[Index_t(8) * idx]; } + + // elem connectivities through face + Index_t& lxim(Index_t idx) { return m_faceToElem[idx].lxim; } + Index_t& lxip(Index_t idx) { return m_faceToElem[idx].lxip; } + Index_t& letam(Index_t idx) { return m_faceToElem[idx].letam; } + Index_t& letap(Index_t idx) { return m_faceToElem[idx].letap; } + Index_t& lzetam(Index_t idx) { return m_faceToElem[idx].lzetam; } + Index_t& lzetap(Index_t idx) { return m_faceToElem[idx].lzetap; } + + // elem face symm/free-surface flag + Int_t& elemBC(Index_t idx) { return m_elemBC[idx]; } + + // Principal strains - temporary + Real_t& dxx(Index_t idx) { return m_dxx[idx]; } + Real_t& dyy(Index_t idx) { return m_dyy[idx]; } + Real_t& dzz(Index_t idx) { return m_dzz[idx]; } + + // Velocity gradient - temporary + Real_t& delv_xi(Index_t idx) { return m_delv_xi[idx]; } + Real_t& delv_eta(Index_t idx) { return m_delv_eta[idx]; } + Real_t& delv_zeta(Index_t idx) { return m_delv_zeta[idx]; } + + // Position gradient - temporary + Real_t& delx_xi(Index_t idx) { return m_delx_xi[idx]; } + Real_t& delx_eta(Index_t idx) { return m_delx_eta[idx]; } + Real_t& delx_zeta(Index_t idx) { return m_delx_zeta[idx]; } + + // Energy + Real_t& e(Index_t idx) { return m_e[idx]; } + + // Pressure + Real_t& p(Index_t idx) { return m_pq[idx].p; } + + // Artificial viscosity + Real_t& q(Index_t idx) { return m_pq[idx].q; } + + // Linear term for q + Real_t& ql(Index_t idx) { return m_qlqq[idx].ql; } + // Quadratic term for q + Real_t& qq(Index_t idx) { return m_qlqq[idx].qq; } + + Real_t& delv(Index_t idx) { return m_delv[idx]; } + + // Relative volume + Real_t& v(Index_t idx) { return m_vol[idx].v; } + // Reference volume + Real_t& volo(Index_t idx) { return m_vol[idx].volo; } + + // volume derivative over volume + Real_t& vdov(Index_t idx) { return m_vdov[idx]; } + + // Element characteristic length + Real_t& arealg(Index_t idx) { 
return m_arealg[idx]; } + + // Sound speed + Real_t& ss(Index_t idx) { return m_ss[idx]; } + + // Element mass + Real_t& elemMass(Index_t idx) { return m_elemMass[idx]; } + + Index_t nodeElemCount(Index_t idx) + { + return m_nodeElemStart[idx + 1] - m_nodeElemStart[idx]; + } + + Index_t* nodeElemCornerList(Index_t idx) + { + return &m_nodeElemCornerList[m_nodeElemStart[idx]]; + } + + // Parameters + + // Cutoffs + Real_t u_cut() const { return m_u_cut; } + Real_t e_cut() const { return m_e_cut; } + Real_t p_cut() const { return m_p_cut; } + Real_t q_cut() const { return m_q_cut; } + Real_t v_cut() const { return m_v_cut; } + + // Other constants (usually are settable via input file in real codes) + Real_t hgcoef() const { return m_hgcoef; } + Real_t qstop() const { return m_qstop; } + Real_t monoq_max_slope() const { return m_monoq_max_slope; } + Real_t monoq_limiter_mult() const { return m_monoq_limiter_mult; } + Real_t ss4o3() const { return m_ss4o3; } + Real_t qlc_monoq() const { return m_qlc_monoq; } + Real_t qqc_monoq() const { return m_qqc_monoq; } + Real_t qqc() const { return m_qqc; } + + Real_t eosvmax() const { return m_eosvmax; } + Real_t eosvmin() const { return m_eosvmin; } + Real_t pmin() const { return m_pmin; } + Real_t emin() const { return m_emin; } + Real_t dvovmax() const { return m_dvovmax; } + Real_t refdens() const { return m_refdens; } + + // Timestep controls, etc... 
+ Real_t& time() { return m_time; } + Real_t& deltatime() { return m_deltatime; } + Real_t& deltatimemultlb() { return m_deltatimemultlb; } + Real_t& deltatimemultub() { return m_deltatimemultub; } + Real_t& stoptime() { return m_stoptime; } + Real_t& dtcourant() { return m_dtcourant; } + Real_t& dthydro() { return m_dthydro; } + Real_t& dtmax() { return m_dtmax; } + Real_t& dtfixed() { return m_dtfixed; } + + Int_t& cycle() { return m_cycle; } + Index_t& numRanks() { return m_numRanks; } + + Index_t& colLoc() { return m_colLoc; } + Index_t& rowLoc() { return m_rowLoc; } + Index_t& planeLoc() { return m_planeLoc; } + Index_t& tp() { return m_tp; } + + Index_t& sizeX() { return m_sizeX; } + Index_t& sizeY() { return m_sizeY; } + Index_t& sizeZ() { return m_sizeZ; } + Index_t& numReg() { return m_numReg; } + Int_t& cost() { return m_cost; } + Index_t& numElem() { return m_numElem; } + Index_t& numNode() { return m_numNode; } + + Index_t& maxPlaneSize() { return m_maxPlaneSize; } + Index_t& maxEdgeSize() { return m_maxEdgeSize; } + + // + // MPI-Related additional data + // + +#if USE_MPI + // Communication Work space + Real_t* commDataSend; + Real_t* commDataRecv; + + // Maximum number of block neighbors + MPI_Request recvRequest[26]; // 6 faces + 12 edges + 8 corners + MPI_Request sendRequest[26]; // 6 faces + 12 edges + 8 corners +#endif + +private: + void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems); + void SetupThreadSupportStructures(); + void CreateRegionIndexSets(Int_t nreg, Int_t balance); + void SetupCommBuffers(Int_t edgeNodes); + void SetupSymmetryPlanes(Int_t edgeNodes); + void SetupElementConnectivities(Int_t edgeElems); + void SetupBoundaryConditions(Int_t edgeElems); + + // + // IMPLEMENTATION + // + + /* Node-centered */ + + struct Tuple3 + { + Real_t x, y, z; + }; + + Kokkos::vector m_coord; /* coordinates */ + + Kokkos::vector m_vel; /* velocities */ + + Kokkos::vector m_acc; /* accelerations */ + + Kokkos::vector m_force; /* forces */ + + 
Kokkos::vector m_nodalMass; /* mass */ + + Kokkos::vector m_symmX; /* symmetry plane nodesets */ + Kokkos::vector m_symmY; + Kokkos::vector m_symmZ; + + // Element-centered + + // Region information + Int_t m_numReg; + Int_t m_cost; // imbalance cost + Index_t* m_regElemSize; // Size of region sets + Index_t* m_regNumList; // Region number per domain element + Index_t** m_regElemlist; // region indexset + + Kokkos::vector m_nodelist; /* elemToNode connectivity */ + + struct FaceElemConn + { + Index_t lxim, lxip, letam, letap, lzetam, lzetap; + }; + + Kokkos::vector m_faceToElem; /* element conn across faces */ + + Kokkos::vector m_elemBC; /* symmetry/free-surface flags for each elem face */ + + Kokkos::vector m_dxx; /* principal strains -- temporary */ + Kokkos::vector m_dyy; + Kokkos::vector m_dzz; + + Kokkos::vector m_delv_xi; /* velocity gradient -- temporary */ + Kokkos::vector m_delv_eta; + Kokkos::vector m_delv_zeta; + + Kokkos::vector m_delx_xi; /* coordinate gradient -- temporary */ + Kokkos::vector m_delx_eta; + Kokkos::vector m_delx_zeta; + + Kokkos::vector m_e; /* energy */ + + struct Pcomponents + { + Real_t p, q; + }; + + Kokkos::vector m_pq; /* pressure and artificial viscosity */ + + struct Qcomponents + { + Real_t ql, qq; + }; + + Kokkos::vector m_qlqq; /* linear and quadratic terms for q */ + + struct Volume + { + Real_t v, volo; + }; + + Kokkos::vector m_vol; /* relative and reference volume */ + + Kokkos::vector m_vnew; /* new relative volume -- temporary */ + Kokkos::vector m_delv; /* m_vnew - m_v */ + Kokkos::vector m_vdov; /* volume derivative over volume */ + + Kokkos::vector m_arealg; /* characteristic length of an element */ + + Kokkos::vector m_ss; /* "sound speed" */ + + Kokkos::vector m_elemMass; /* mass */ + + // Cutoffs (treat as constants) + const Real_t m_e_cut; // energy tolerance + const Real_t m_p_cut; // pressure tolerance + const Real_t m_q_cut; // q tolerance + const Real_t m_v_cut; // relative volume tolerance + const Real_t 
m_u_cut; // velocity tolerance + + // Other constants (usually setable, but hardcoded in this proxy app) + + const Real_t m_hgcoef; // hourglass control + const Real_t m_ss4o3; + const Real_t m_qstop; // excessive q indicator + const Real_t m_monoq_max_slope; + const Real_t m_monoq_limiter_mult; + const Real_t m_qlc_monoq; // linear term coef for q + const Real_t m_qqc_monoq; // quadratic term coef for q + const Real_t m_qqc; + const Real_t m_eosvmax; + const Real_t m_eosvmin; + const Real_t m_pmin; // pressure floor + const Real_t m_emin; // energy floor + const Real_t m_dvovmax; // maximum allowable volume change + const Real_t m_refdens; // reference density + + // Variables to keep track of timestep, simulation time, and cycle + Real_t m_dtcourant; // courant constraint + Real_t m_dthydro; // volume change constraint + Int_t m_cycle; // iteration count for simulation + Real_t m_dtfixed; // fixed time increment + Real_t m_time; // current time + Real_t m_deltatime; // variable time increment + Real_t m_deltatimemultlb; + Real_t m_deltatimemultub; + Real_t m_dtmax; // maximum allowable time increment + Real_t m_stoptime; // end time for simulation + + Int_t m_numRanks; + + Index_t m_colLoc; + Index_t m_rowLoc; + Index_t m_planeLoc; + Index_t m_tp; + + Index_t m_sizeX; + Index_t m_sizeY; + Index_t m_sizeZ; + Index_t m_numElem; + Index_t m_numNode; + + Index_t m_maxPlaneSize; + Index_t m_maxEdgeSize; + + // OMP hack + Index_t* m_nodeElemStart; + Index_t* m_nodeElemCornerList; + + // Used in setup + Index_t m_rowMin, m_rowMax; + Index_t m_colMin, m_colMax; + Index_t m_planeMin, m_planeMax; +}; + +typedef Real_t& (Domain::*Domain_member)(Index_t); + +struct cmdLineOpts +{ + Int_t its; // -i + Int_t nx; // -s + Int_t numReg; // -r + Int_t numFiles; // -f + Int_t showProg; // -p + Int_t quiet; // -q + Int_t viz; // -v + Int_t cost; // -c + Int_t balance; // -b +}; + +// Function Prototypes + +// lulesh-par +Real_t +CalcElemVolume(const Real_t x[8], const Real_t y[8], 
const Real_t z[8]); + +// lulesh-util +void +ParseCommandLineOptions(int argc, char* argv[], Int_t myRank, struct cmdLineOpts* opts); +void +VerifyAndWriteFinalOutput(Real_t elapsed_time, Domain& locDom, Int_t nx, Int_t numRanks); + +// lulesh-viz +void +DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks); + +// lulesh-comm +void +CommRecv(Domain& domain, Int_t msgType, Index_t xferFields, Index_t dx, Index_t dy, + Index_t dz, bool doRecv, bool planeOnly); +void +CommSend(Domain& domain, Int_t msgType, Index_t xferFields, Domain_member* fieldData, + Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly); +void +CommSBN(Domain& domain, Int_t xferFields, Domain_member* fieldData); +void +CommSyncPosVel(Domain& domain); +void +CommMonoQ(Domain& domain); + +// lulesh-init +void +InitMeshDecomp(Int_t numRanks, Int_t myRank, Int_t* col, Int_t* row, Int_t* plane, + Int_t* side); diff --git a/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt index 67381a4f51..3acd578282 100644 --- a/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +cmake_minimum_required(VERSION 3.15 FATAL_ERROR) project(omnitrace-parallel-overhead LANGUAGES CXX) diff --git a/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp index bc08ce8437..291401ccc3 100644 --- a/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp +++ b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp @@ -36,6 +36,9 @@ main(int argc, char** argv) if(argc > 2) nthread = atol(argv[2]); if(argc > 3) nitr = atol(argv[3]); + printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", 
argv[0], + nthread, argv[0], nitr, argv[0], nfib); + std::vector threads{}; for(size_t i = 0; i < nthread; ++i) { @@ -43,10 +46,11 @@ main(int argc, char** argv) threads.emplace_back(&run, _nitr, nfib); } + run(nitr - 0.25 * nitr, nfib - 0.1 * nfib); for(auto& itr : threads) itr.join(); - printf("fibonacci(%li) x %lu = %li\n", nfib, nthread, total.load()); + printf("[%s] fibonacci(%li) x %lu = %li\n", argv[0], nfib, nthread, total.load()); return 0; } diff --git a/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt b/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt index c031d1be15..c918563590 100644 --- a/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +cmake_minimum_required(VERSION 3.15 FATAL_ERROR) project(omnitrace-transpose LANGUAGES CXX) diff --git a/projects/rocprofiler-systems/examples/transpose/transpose.cpp b/projects/rocprofiler-systems/examples/transpose/transpose.cpp index abf043c030..74cfbc183a 100644 --- a/projects/rocprofiler-systems/examples/transpose/transpose.cpp +++ b/projects/rocprofiler-systems/examples/transpose/transpose.cpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include "hip/hip_runtime.h" + #include #include #include @@ -29,14 +30,19 @@ THE SOFTWARE. 
#include #include #include +#include #include #include +static std::mutex print_lock{}; +using auto_lock_t = std::unique_lock; + #define HIP_API_CALL(CALL) \ { \ hipError_t error_ = (CALL); \ if(error_ != hipSuccess) \ { \ + auto_lock_t _lk{ print_lock }; \ fprintf(stderr, "%s:%d :: HIP error : %s\n", __FILE__, __LINE__, \ hipGetErrorString(error_)); \ exit(EXIT_FAILURE); \ @@ -49,6 +55,7 @@ check_hip_error(void) hipError_t err = hipGetLastError(); if(err != hipSuccess) { + auto_lock_t _lk{ print_lock }; std::cerr << "Error: " << hipGetErrorString(err) << std::endl; exit(err); } @@ -63,6 +70,7 @@ verify(int* in, int* out, int M, int N) int col = rand() % N; if(in[row * N + col] != out[col * M + row]) { + auto_lock_t _lk{ print_lock }; std::cout << "mismatch: " << row << ", " << col << " : " << in[row * N + col] << " | " << out[col * M + row] << "\n"; } @@ -85,19 +93,23 @@ transpose_a(int* in, int* out, int M, int N) } void -run(int rank, int argc, char** argv) +run(int rank, int tid, hipStream_t stream, int argc, char** argv) { - (void) argc; - (void) argv; - unsigned int M = 4960 * 2; - unsigned int N = 4960 * 2; + size_t nitr = 5000; + unsigned int M = 4960 * 2; + unsigned int N = 4960 * 2; + if(argc > 2) nitr = atoll(argv[2]); + + auto_lock_t _lk{ print_lock }; + std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + _lk.unlock(); - std::cout << "[" << rank << "] M: " << M << " N: " << N << std::endl; size_t size = sizeof(int) * M * N; - int* matrix = (int*) malloc(size); + int* matrix = new int[size / sizeof(int)]; // 'size' is a byte count: allocate M * N ints for(size_t i = 0; i < M * N; i++) matrix[i] = rand() % 1002; - int *in, *out; + int* in = nullptr; + int* out = nullptr; std::chrono::high_resolution_clock::time_point t1, t2; @@ -106,37 +118,36 @@ run(int rank, int argc, char** argv) HIP_API_CALL(hipMemset(in, 0, size)); HIP_API_CALL(hipMemset(out, 0, size)); HIP_API_CALL(hipMemcpy(in, matrix, size, hipMemcpyHostToDevice)); + HIP_API_CALL(hipDeviceSynchronize()); + hipDeviceProp_t 
props; HIP_API_CALL(hipGetDeviceProperties(&props, 0)); dim3 grid(M / 32, N / 32, 1); dim3 block(32, 32, 1); // transpose_a - t1 = std::chrono::high_resolution_clock::now(); - const unsigned times = 10000; - auto _func = [&](hipStream_t stream) { - for(size_t i = 0; i < times / 2; i++) - { - transpose_a<<>>(in, out, M, N); - check_hip_error(); - } - HIP_API_CALL(hipStreamSynchronize(stream)); - }; - hipStream_t _stream{}; - HIP_API_CALL(hipStreamCreate(&_stream)); - std::thread _t{ _func, _stream }; - _t.join(); - _func(0); - HIP_API_CALL(hipDeviceSynchronize()); + t1 = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < nitr; i++) + { + transpose_a<<>>(in, out, M, N); + check_hip_error(); + } + HIP_API_CALL(hipStreamSynchronize(stream)); t2 = std::chrono::high_resolution_clock::now(); double time = std::chrono::duration_cast>(t2 - t1).count(); - float GB = (float) size * times * 2 / (1 << 30); - std::cout << "[" << rank << "] Runtime of transpose is " << time << " sec\n" + float GB = (float) size * nitr * 2 / (1 << 30); + + print_lock.lock(); + std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time + << " sec\n" << "The average performance of transpose is " << GB / time << " GBytes/sec" << std::endl; + print_lock.unlock(); - int* out_matrix = (int*) malloc(size); + HIP_API_CALL(hipDeviceSynchronize()); + + int* out_matrix = new int[size / sizeof(int)]; // 'size' is a byte count: allocate M * N ints HIP_API_CALL(hipMemcpy(out_matrix, out, size, hipMemcpyDeviceToHost)); // cpu_transpose(matrix, out_matrix, M, N); @@ -145,8 +156,8 @@ run(int rank, int argc, char** argv) HIP_API_CALL(hipFree(in)); HIP_API_CALL(hipFree(out)); - free(matrix); - free(out_matrix); + delete[] matrix; + delete[] out_matrix; } #if defined(USE_MPI) @@ -174,12 +185,16 @@ main(int argc, char** argv) int rank = 0; int size = 1; int nthreads = 2; + int nitr = 5000; if(argc > 1) nthreads = atoi(argv[1]); + if(argc > 2) nitr = atoi(argv[2]); #if defined(USE_MPI) MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, 
&rank); MPI_Comm_size(MPI_COMM_WORLD, &size); +#else + (void) size; #endif // this is a temporary workaround in omnitrace when HIP + MPI is enabled int ndevice = 0; @@ -193,16 +208,24 @@ main(int argc, char** argv) if(rank == devid && rank < ndevice) { std::vector _threads{}; + std::vector _streams(nthreads); + for(int i = 0; i < nthreads; ++i) + HIP_API_CALL(hipStreamCreate(&_streams.at(i))); for(int i = 1; i < nthreads; ++i) - _threads.emplace_back(run, rank, argc, argv); - run(rank, argc, argv); + _threads.emplace_back(run, rank, i, _streams.at(i), argc, argv); + run(rank, 0, _streams.at(0), argc, argv); for(auto& itr : _threads) itr.join(); + for(int i = 0; i < nthreads; ++i) + HIP_API_CALL(hipStreamDestroy(_streams.at(i))); } #if defined(USE_MPI) MPI_Barrier(MPI_COMM_WORLD); do_a2a(rank); MPI_Finalize(); #endif + HIP_API_CALL(hipDeviceSynchronize()); + HIP_API_CALL(hipDeviceReset()); + return 0; } diff --git a/projects/rocprofiler-systems/external/PTL b/projects/rocprofiler-systems/external/PTL index dd1b67829c..61f873cf79 160000 --- a/projects/rocprofiler-systems/external/PTL +++ b/projects/rocprofiler-systems/external/PTL @@ -1 +1 @@ -Subproject commit dd1b67829c9875bd78016d3aad35f22890b59b6c +Subproject commit 61f873cf79a016b0572c04f2df075a75a66389aa diff --git a/projects/rocprofiler-systems/external/dyninst b/projects/rocprofiler-systems/external/dyninst index 076d8bdef4..82b10fdcf5 160000 --- a/projects/rocprofiler-systems/external/dyninst +++ b/projects/rocprofiler-systems/external/dyninst @@ -1 +1 @@ -Subproject commit 076d8bdef4f22639d16ca65cda9b909b6c726047 +Subproject commit 82b10fdcf589e084f6491ed08668ab60c02dae0e diff --git a/projects/rocprofiler-systems/external/timemory b/projects/rocprofiler-systems/external/timemory index c040fe7022..335abea0c5 160000 --- a/projects/rocprofiler-systems/external/timemory +++ b/projects/rocprofiler-systems/external/timemory @@ -1 +1 @@ -Subproject commit c040fe702285e7b4345d4ccfdb8a08704dbeeb83 +Subproject commit 
335abea0c51e498cf419716839690cd0ccb1aeac diff --git a/projects/rocprofiler-systems/include/avail.hpp b/projects/rocprofiler-systems/include/avail.hpp new file mode 100644 index 0000000000..c14f68e218 --- /dev/null +++ b/projects/rocprofiler-systems/include/avail.hpp @@ -0,0 +1,337 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+ +/** \file timemory/tools/available.hpp + * \headerfile tools/available.hpp "tools/available.hpp" + * Handles serializing the settings + * + */ + +#pragma once + +#define TIMEMORY_DISABLE_BANNER +#define TIMEMORY_DISABLE_COMPONENT_STORAGE_INIT + +#include "timemory/settings/macros.hpp" +#include "timemory/tpls/cereal/archives.hpp" +#include "timemory/tpls/cereal/cereal/external/base64.hpp" +#include "timemory/utility/demangle.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(TIMEMORY_CEREAL_EPILOGUE_FUNCTION_NAME) +# define TIMEMORY_CEREAL_EPILOGUE_FUNCTION_NAME epilogue +#endif + +#if !defined(TIMEMORY_CEREAL_PROLOGUE_FUNCTION_NAME) +# define TIMEMORY_CEREAL_PROLOGUE_FUNCTION_NAME prologue +#endif + +//======================================================================================// + +namespace tim +{ +namespace cereal +{ +class SettingsTextArchive +: public OutputArchive +, public traits::TextArchive +{ +public: + using width_type = std::vector; + using value_type = std::string; + using entry_type = std::map; + using array_type = std::vector; + using unique_set = std::set; + using int_stack = std::stack; + +public: + //! Construct, outputting to the provided stream + /// \param stream The array of output data + SettingsTextArchive(array_type& stream, unique_set exclude) + : OutputArchive(this) + , output_stream(&stream) + , exclude_stream(std::move(exclude)) + { + name_counter.push(0); + } + + ~SettingsTextArchive() override = default; + + void saveBinaryValue(const void* data, size_t size, const char* name = nullptr) + { + setNextName(name); + writeName(); + + auto base64string = + base64::encode(reinterpret_cast(data), size); + saveValue(base64string); + } + + void startNode() { name_counter.push(0); } + + void finishNode() { name_counter.pop(); } + + //! 
Sets the name for the next node created with startNode + void setNextName(const char* name) + { + if(exclude_stream.count(name) > 0) return; + + if((current_entry != nullptr) && value_keys.count(name) > 0) + { + current_entry->insert({ name, "" }); + current_value = &((*current_entry)[name]); + return; + } + if(value_keys.count(name) > 0) + { + return; + } + + current_value = nullptr; + output_stream->push_back(entry_type{}); + current_entry = &(output_stream->back()); + + current_entry->insert({ "identifier", name }); + std::string func = name; + const std::string prefix = TIMEMORY_SETTINGS_PREFIX; + func = func.erase(0, prefix.length()); + std::transform(func.begin(), func.end(), func.begin(), + [](char& c) { return tolower(c); }); + { + std::stringstream ss; + ss << "settings::" << func << "()"; + current_entry->insert({ "static_accessor", ss.str() }); + } + { + std::stringstream ss; + ss << "settings::instance()->get_" << func << "()"; + current_entry->insert({ "member_accessor", ss.str() }); + } + { + std::stringstream ss; + ss << "settings." 
<< func; + current_entry->insert({ "python_accessor", ss.str() }); + } + } + + void setNextType(const char*) {} + +public: + template + inline void saveValue(Tp _val) + { + std::stringstream ssval; + ssval << std::boolalpha << _val; + if(current_value) + { + *current_value = ssval.str(); + } + } + + void writeName() {} + + void makeArray() {} + +private: + value_type* current_value = nullptr; + entry_type* current_entry = nullptr; + array_type* output_stream = nullptr; + unique_set exclude_stream = {}; + int_stack name_counter; + unique_set value_keys = { "name", "value", "description", "count", + "environ", "max_count", "cmdline", "data_type", + "initial", "categories" }; +}; + +//======================================================================================// +// +// prologue and epilogue functions +// +//======================================================================================// + +//--------------------------------------------------------------------------------------// +//! Prologue for NVPs for settings archive +/*! NVPs do not start or finish nodes - they just set up the names */ +template +inline void +TIMEMORY_CEREAL_PROLOGUE_FUNCTION_NAME(SettingsTextArchive&, const NameValuePair&) +{} + +//--------------------------------------------------------------------------------------// +//! Epilogue for NVPs for settings archive +/*! NVPs do not start or finish nodes - they just set up the names */ +template +inline void +TIMEMORY_CEREAL_EPILOGUE_FUNCTION_NAME(SettingsTextArchive&, const NameValuePair&) +{} + +//--------------------------------------------------------------------------------------// +//! Prologue for deferred data for settings archive +/*! Do nothing for the defer wrapper */ +template +inline void +TIMEMORY_CEREAL_PROLOGUE_FUNCTION_NAME(SettingsTextArchive&, const DeferredData&) +{} + +//--------------------------------------------------------------------------------------// +//! 
Epilogue for deferred for settings archive +/*! NVPs do not start or finish nodes - they just set up the names */ +template +inline void +TIMEMORY_CEREAL_EPILOGUE_FUNCTION_NAME(SettingsTextArchive&, const DeferredData&) +{} + +//--------------------------------------------------------------------------------------// +//! Prologue for SizeTags for settings archive +/*! SizeTags are ignored */ +template +inline void +TIMEMORY_CEREAL_PROLOGUE_FUNCTION_NAME(SettingsTextArchive& ar, const SizeTag&) +{ + ar.makeArray(); +} + +//--------------------------------------------------------------------------------------// +//! Epilogue for SizeTags for settings archive +/*! SizeTags are ignored */ +template +inline void +TIMEMORY_CEREAL_EPILOGUE_FUNCTION_NAME(SettingsTextArchive&, const SizeTag&) +{} + +//--------------------------------------------------------------------------------------// +//! Prologue for all other types for settings archive +/*! Starts a new node, named either automatically or by some NVP, + that may be given data by the type about to be archived*/ +template +inline void +TIMEMORY_CEREAL_PROLOGUE_FUNCTION_NAME(SettingsTextArchive& ar, const T&) +{ + ar.startNode(); +} + +//--------------------------------------------------------------------------------------// +//! Epilogue for all other types other for settings archive +/*! Finishes the node created in the prologue*/ +template +inline void +TIMEMORY_CEREAL_EPILOGUE_FUNCTION_NAME(SettingsTextArchive& ar, const T&) +{ + ar.finishNode(); +} + +//--------------------------------------------------------------------------------------// +//! Prologue for arithmetic types for settings archive +inline void +TIMEMORY_CEREAL_PROLOGUE_FUNCTION_NAME(SettingsTextArchive&, const std::nullptr_t&) +{} + +//--------------------------------------------------------------------------------------// +//! 
Epilogue for arithmetic types for settings archive +inline void +TIMEMORY_CEREAL_EPILOGUE_FUNCTION_NAME(SettingsTextArchive&, const std::nullptr_t&) +{} + +//======================================================================================// +// +// Common serialization functions +// +//======================================================================================// + +//! Serializing NVP types +template +inline void +TIMEMORY_CEREAL_SAVE_FUNCTION_NAME(SettingsTextArchive& ar, const NameValuePair& t) +{ + ar.setNextName(t.name); + if(std::is_same::value) + { + ar.setNextType("string"); + } + else + { + ar.setNextType(tim::demangle().c_str()); + } + ar(t.value); +} + +template +inline void +TIMEMORY_CEREAL_SAVE_FUNCTION_NAME( + SettingsTextArchive& ar, + const NameValuePair>& t) +{ + ar.setNextName(t.name); + ar.setNextType("string"); + ar(t.value); +} + +//! Saving for nullptr +inline void +TIMEMORY_CEREAL_SAVE_FUNCTION_NAME(SettingsTextArchive&, const std::nullptr_t&) +{} + +//! Saving for arithmetic +template ::value> = traits::sfinae> +inline void +TIMEMORY_CEREAL_SAVE_FUNCTION_NAME(SettingsTextArchive& ar, const T& t) +{ + if(std::is_same::value) ar.setNextType("string"); + ar.saveValue(t); +} + +//! saving string +template +inline void +TIMEMORY_CEREAL_SAVE_FUNCTION_NAME(SettingsTextArchive& ar, + const std::basic_string& str) +{ + ar.setNextType("string"); + ar.saveValue(str); +} + +//--------------------------------------------------------------------------------------// +//! 
Saving SizeTags +template +inline void +TIMEMORY_CEREAL_SAVE_FUNCTION_NAME(SettingsTextArchive&, const SizeTag&) +{ + // nothing to do here, we don't explicitly save the size +} + +} // namespace cereal +} // namespace tim + +// register archives for polymorphic support +TIMEMORY_CEREAL_REGISTER_ARCHIVE(SettingsTextArchive) diff --git a/projects/rocprofiler-systems/include/library.hpp b/projects/rocprofiler-systems/include/library.hpp index 202ced1ff3..0433a968df 100644 --- a/projects/rocprofiler-systems/include/library.hpp +++ b/projects/rocprofiler-systems/include/library.hpp @@ -34,10 +34,10 @@ // clang-format on #include "library/timemory.hpp" -#include "library/roctracer.hpp" +#include "library/components/roctracer.hpp" #include "library/api.hpp" -#include "library/fork_gotcha.hpp" -#include "library/mpi_gotcha.hpp" +#include "library/components/fork_gotcha.hpp" +#include "library/components/mpi_gotcha.hpp" #include "library/api.hpp" #include "library/common.hpp" #include "library/state.hpp" @@ -51,6 +51,8 @@ #include +namespace omnitrace +{ template inline void @@ -74,7 +76,7 @@ add_critical_trace(int64_t _tid, size_t _cpu_cid, size_t _gpu_cid, size_t _paren if constexpr(PhaseID != critical_trace::Phase::NONE) { // unique lock per thread - auto& _mtx = type_mutex(_tid); + auto& _mtx = type_mutex(_tid); auto_lock_t _lk{ _mtx }; auto& _critical_trace = critical_trace::get(_tid); @@ -86,7 +88,7 @@ add_critical_trace(int64_t _tid, size_t _cpu_cid, size_t _gpu_cid, size_t _paren if constexpr(UpdateStack) { // unique lock per thread - auto& _mtx = type_mutex(_tid); + auto& _mtx = type_mutex(_tid); if constexpr(PhaseID == critical_trace::Phase::NONE) { @@ -110,3 +112,4 @@ add_critical_trace(int64_t _tid, size_t _cpu_cid, size_t _gpu_cid, size_t _paren tim::consume_parameters(_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val, _hash, _depth, _prio); } +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/api.hpp 
b/projects/rocprofiler-systems/include/library/api.hpp index 6787f9cefb..d5ff83c522 100644 --- a/projects/rocprofiler-systems/include/library/api.hpp +++ b/projects/rocprofiler-systems/include/library/api.hpp @@ -28,6 +28,8 @@ #pragma once +#include "library/defines.hpp" + #include // forward decl of the API diff --git a/projects/rocprofiler-systems/include/library/common.hpp b/projects/rocprofiler-systems/include/library/common.hpp index 8b6e711027..39a0ae5929 100644 --- a/projects/rocprofiler-systems/include/library/common.hpp +++ b/projects/rocprofiler-systems/include/library/common.hpp @@ -28,6 +28,8 @@ #pragma once +#include "library/defines.hpp" + #include #include #include @@ -45,6 +47,10 @@ #include #include -// timemory api struct -struct omnitrace : tim::concepts::api -{}; +TIMEMORY_DEFINE_NS_API(api, omnitrace) +TIMEMORY_DEFINE_NS_API(api, sampling) + +namespace omnitrace +{ +namespace api = tim::api; // NOLINT +} diff --git a/projects/rocprofiler-systems/include/library/components/backtrace.hpp b/projects/rocprofiler-systems/include/library/components/backtrace.hpp new file mode 100644 index 0000000000..70dfebbb63 --- /dev/null +++ b/projects/rocprofiler-systems/include/library/components/backtrace.hpp @@ -0,0 +1,125 @@ +// Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// with the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of Advanced Micro Devices, Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this Software without specific prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +// THE SOFTWARE. + +#pragma once + +#include "library/common.hpp" +#include "library/components/fwd.hpp" +#include "library/defines.hpp" +#include "library/thread_data.hpp" +#include "library/timemory.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace omnitrace +{ +namespace component +{ +struct backtrace +: tim::component::empty_base +, tim::concepts::component +{ + static constexpr size_t num_hw_counters = 8; + + using data_t = std::array; + using clock_type = std::chrono::steady_clock; + using time_point_type = typename clock_type::time_point; + using value_type = void; + using hw_counters = tim::component::papi_array; + using hw_counter_data_t = typename hw_counters::value_type; + using system_clock = std::chrono::system_clock; + using system_time_point = typename system_clock::time_point; + + static void preinit(); + static std::string label(); + static std::string description(); + + backtrace() = default; + ~backtrace() = default; + backtrace(backtrace&&) = default; + 
backtrace(const backtrace&) = default; + + backtrace& operator=(const backtrace&) = default; + backtrace& operator=(backtrace&&) = default; + + bool operator<(const backtrace& rhs) const; + + static std::set configure(bool, int64_t _tid = threading::get_id()); + static void post_process(int64_t _tid = threading::get_id()); + static hw_counter_data_t& get_last_hwcounters(); + + static void start(); + static void stop(); + void sample(int = -1); + bool empty() const; + size_t size() const; + std::vector get() const; + time_point_type get_timestamp() const; + int64_t get_thread_cpu_timestamp() const; + +private: + int64_t m_tid = 0; + int64_t m_thr_cpu_ts = 0; + size_t m_size = 0; + time_point_type m_ts = {}; + data_t m_data = {}; + hw_counter_data_t m_hw_counter = {}; +}; +} // namespace component +} // namespace omnitrace + +#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ + (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) + +# include + +TIMEMORY_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +TIMEMORY_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +TIMEMORY_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +#endif diff --git a/projects/rocprofiler-systems/include/library/fork_gotcha.hpp b/projects/rocprofiler-systems/include/library/components/fork_gotcha.hpp similarity index 83% rename from projects/rocprofiler-systems/include/library/fork_gotcha.hpp rename to projects/rocprofiler-systems/include/library/components/fork_gotcha.hpp index 8b3894f285..8b3459b327 100644 --- a/projects/rocprofiler-systems/include/library/fork_gotcha.hpp +++ b/projects/rocprofiler-systems/include/library/components/fork_gotcha.hpp @@ -29,8 +29,11 @@ #pragma once #include "library/common.hpp" +#include "library/defines.hpp" #include "library/timemory.hpp" +namespace omnitrace +{ // this is used to wrap fork() struct fork_gotcha : comp::base { @@ -38,11 +41,18 @@ struct fork_gotcha : 
comp::base TIMEMORY_DEFAULT_OBJECT(fork_gotcha) + // string id for component + static std::string label() { return "fork_gotcha"; } + + // generate the gotcha wrappers + static void configure(); + // this will get called right before fork - void audit(const gotcha_data_t& _data, audit::incoming); + static void audit(const gotcha_data_t& _data, audit::incoming); // this will get called right after fork with the return value - void audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid); + static void audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid); }; -using fork_gotcha_t = comp::gotcha<4, tim::component_tuple, omnitrace>; +using fork_gotcha_t = comp::gotcha<4, tim::component_tuple, api::omnitrace>; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/components/fwd.hpp b/projects/rocprofiler-systems/include/library/components/fwd.hpp new file mode 100644 index 0000000000..320921a25b --- /dev/null +++ b/projects/rocprofiler-systems/include/library/components/fwd.hpp @@ -0,0 +1,121 @@ +// Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// with the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. 
+// +// * Neither the names of Advanced Micro Devices, Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this Software without specific prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +// THE SOFTWARE. + +#pragma once + +#include "library/defines.hpp" +#include "timemory/components/user_bundle/types.hpp" + +#include +#include +#include + +TIMEMORY_DECLARE_COMPONENT(roctracer) + +namespace omnitrace +{ +namespace component +{ +template +using data_tracker = tim::component::data_tracker; + +struct omnitrace; +struct backtrace; +struct backtrace_wall_clock +{}; +struct backtrace_cpu_clock +{}; +struct backtrace_fraction +{}; +using sampling_wall_clock = data_tracker; +using sampling_cpu_clock = data_tracker; +using sampling_percent = data_tracker; +using roctracer = tim::component::roctracer; +} // namespace component +} // namespace omnitrace + +#if !defined(OMNITRACE_USE_ROCTRACER) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type) +#endif + +#if !defined(TIMEMORY_USE_LIBUNWIND) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::sampling::backtrace, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_wall_clock, + false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_cpu_clock, + false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_percent, 
+ false_type) +#endif + +TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::omnitrace, OMNITRACE_COMPONENT, + "omnitrace", "omnitrace_component") +TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::roctracer, OMNITRACE_ROCTRACER, + "roctracer", "omnitrace_roctracer") +TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_wall_clock, + OMNITRACE_SAMPLING_WALL_CLOCK, "sampling_wall_clock", "") +TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_cpu_clock, + OMNITRACE_SAMPLING_CPU_CLOCK, "sampling_cpu_clock", "") +TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_percent, + OMNITRACE_SAMPLING_PERCENT, "sampling_percent", "") + +TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::sampling_wall_clock, + "sampling_wall_clock", "Wall-clock timing", + "derived from statistical sampling") +TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::sampling_cpu_clock, + "sampling_cpu_clock", "CPU-clock timing", + "derived from statistical sampling") +TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::sampling_percent, + "sampling_percent", + "Fraction of wall-clock time spent in functions", + "derived from statistical sampling") +TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::roctracer, "roctracer", + "High-precision ROCm API and kernel tracing", "") + +TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_wall_clock, double) +TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_cpu_clock, double) + +// enable timing units +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, + omnitrace::component::sampling_wall_clock, true_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_timing_units, + omnitrace::component::sampling_wall_clock, true_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, + omnitrace::component::sampling_cpu_clock, true_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_timing_units, + omnitrace::component::sampling_cpu_clock, true_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, 
omnitrace::component::sampling_percent, + true_type) + +TIMEMORY_DEFINE_CONCRETE_TRAIT(report_mean, omnitrace::component::sampling_percent, + false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(report_units, omnitrace::component::sampling_percent, + false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(report_statistics, omnitrace::component::sampling_percent, + false_type) diff --git a/projects/rocprofiler-systems/include/library/mpi_gotcha.hpp b/projects/rocprofiler-systems/include/library/components/mpi_gotcha.hpp similarity index 88% rename from projects/rocprofiler-systems/include/library/mpi_gotcha.hpp rename to projects/rocprofiler-systems/include/library/components/mpi_gotcha.hpp index 73cd49188a..971c6968ff 100644 --- a/projects/rocprofiler-systems/include/library/mpi_gotcha.hpp +++ b/projects/rocprofiler-systems/include/library/components/mpi_gotcha.hpp @@ -29,8 +29,11 @@ #pragma once #include "library/common.hpp" +#include "library/defines.hpp" #include "library/timemory.hpp" +namespace omnitrace +{ // this is used to wrap MPI_Init and MPI_Init_thread struct mpi_gotcha : comp::base { @@ -39,6 +42,12 @@ struct mpi_gotcha : comp::base TIMEMORY_DEFAULT_OBJECT(mpi_gotcha) + // string id for component + static std::string label() { return "mpi_gotcha"; } + + // generate the gotcha wrappers + static void configure(); + // called right before MPI_Init with that functions arguments static void audit(const gotcha_data_t& _data, audit::incoming, int*, char***); @@ -56,9 +65,10 @@ struct mpi_gotcha : comp::base void audit(const gotcha_data_t& _data, audit::outgoing, int _retval); private: - comm_t m_comm = tim::mpi::comm_world_v; - int* m_rank = nullptr; - int* m_size = nullptr; + void* m_comm = nullptr; + int* m_rank = nullptr; + int* m_size = nullptr; }; -using mpi_gotcha_t = comp::gotcha<5, tim::component_tuple, omnitrace>; +using mpi_gotcha_t = comp::gotcha<5, tim::component_tuple, api::omnitrace>; +} // namespace omnitrace diff --git 
a/projects/rocprofiler-systems/include/library/omnitrace_component.hpp b/projects/rocprofiler-systems/include/library/components/omnitrace.hpp similarity index 76% rename from projects/rocprofiler-systems/include/library/omnitrace_component.hpp rename to projects/rocprofiler-systems/include/library/components/omnitrace.hpp index 0dfd05704a..5268e8914a 100644 --- a/projects/rocprofiler-systems/include/library/omnitrace_component.hpp +++ b/projects/rocprofiler-systems/include/library/components/omnitrace.hpp @@ -28,16 +28,29 @@ #pragma once +#include "library/defines.hpp" #include "library/timemory.hpp" +namespace omnitrace +{ +namespace component +{ // timemory component which calls omnitrace functions // (used in gotcha wrappers) -struct omnitrace_component : comp::base +struct omnitrace : comp::base { - void start(); - void stop(); - void set_prefix(const char*); + static std::string label() { return "omnitrace"; } + void start(); + void stop(); + void set_prefix(const char*); private: const char* m_prefix = nullptr; }; +} // namespace component +} // namespace omnitrace + +TIMEMORY_METADATA_SPECIALIZATION( + omnitrace::component::omnitrace, "omnitrace", + "Invokes instrumentation functions 'omnitrace_push_trace' and 'omnitrace_pop_trace'", + "Used by gotcha wrappers") diff --git a/projects/rocprofiler-systems/include/library/components/pthread_gotcha.hpp b/projects/rocprofiler-systems/include/library/components/pthread_gotcha.hpp new file mode 100644 index 0000000000..d4b1c379e6 --- /dev/null +++ b/projects/rocprofiler-systems/include/library/components/pthread_gotcha.hpp @@ -0,0 +1,75 @@ +// Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// with the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of Advanced Micro Devices, Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this Software without specific prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +// THE SOFTWARE. 
+ +#pragma once + +#include "library/common.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" + +#include + +namespace omnitrace +{ +struct pthread_gotcha : tim::component::base +{ + struct wrapper + { + using routine_t = void* (*) (void*); + using promise_t = std::promise; + + wrapper(routine_t _routine, void* _arg, bool, promise_t*); + void* operator()() const; + + static void* wrap(void* _arg); + + private: + bool m_enable_sampling = false; + routine_t m_routine = nullptr; + void* m_arg = nullptr; + promise_t* m_promise = nullptr; + }; + + TIMEMORY_DEFAULT_OBJECT(pthread_gotcha) + + // string id for component + static std::string label() { return "pthread_gotcha"; } + + // generate the gotcha wrappers + static void configure(); + + // threads can set this to avoid starting sampling on child threads + static bool& enable_sampling_on_child_threads(); + + // pthread_create + int operator()(pthread_t* thread, const pthread_attr_t* attr, + void* (*start_routine)(void*), void* arg) const; +}; + +using pthread_gotcha_t = tim::component::gotcha<2, std::tuple<>, pthread_gotcha>; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/roctracer.hpp b/projects/rocprofiler-systems/include/library/components/roctracer.hpp similarity index 84% rename from projects/rocprofiler-systems/include/library/roctracer.hpp rename to projects/rocprofiler-systems/include/library/components/roctracer.hpp index b3847c6056..3d6be85b7d 100644 --- a/projects/rocprofiler-systems/include/library/roctracer.hpp +++ b/projects/rocprofiler-systems/include/library/components/roctracer.hpp @@ -28,20 +28,17 @@ #pragma once -#include "timemory/api.hpp" -#include "timemory/components/base.hpp" -#include "timemory/components/data_tracker/components.hpp" -#include "timemory/components/macros.hpp" -#include "timemory/enum.h" -#include "timemory/macros/os.hpp" -#include "timemory/mpl/type_traits.hpp" -#include "timemory/mpl/types.hpp" +#include 
"library/components/fwd.hpp" +#include "library/defines.hpp" -TIMEMORY_DECLARE_COMPONENT(roctracer) - -#if !defined(OMNITRACE_USE_ROCTRACER) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type) -#endif +#include +#include +#include +#include +#include +#include +#include +#include namespace tim { @@ -86,7 +83,12 @@ TIMEMORY_SET_COMPONENT_API(component::roctracer_data, project::timemory, categor TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, component::roctracer_data, true_type) TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::roctracer_data, true_type) -#include "timemory/operations.hpp" +#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ + (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) + +# include TIMEMORY_DECLARE_EXTERN_COMPONENT(roctracer, false, void) TIMEMORY_DECLARE_EXTERN_COMPONENT(roctracer_data, true, double) + +#endif diff --git a/projects/rocprofiler-systems/include/library/roctracer_callbacks.hpp b/projects/rocprofiler-systems/include/library/components/roctracer_callbacks.hpp similarity index 87% rename from projects/rocprofiler-systems/include/library/roctracer_callbacks.hpp rename to projects/rocprofiler-systems/include/library/components/roctracer_callbacks.hpp index e1e7ee92cb..50f0478812 100644 --- a/projects/rocprofiler-systems/include/library/roctracer_callbacks.hpp +++ b/projects/rocprofiler-systems/include/library/components/roctracer_callbacks.hpp @@ -28,12 +28,12 @@ #pragma once +#include "library/components/roctracer.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/dynamic_library.hpp" #include "library/perfetto.hpp" #include "library/ptl.hpp" -#include "library/roctracer.hpp" #include #include @@ -58,12 +58,15 @@ } \ } while(0) -using hsa_timer_t = hsa_rt_utils::Timer; -using timestamp_t = hsa_timer_t::timestamp_t; -using roctracer_bundle_t = tim::component_bundle; -using roctracer_hsa_bundle_t = tim::component_bundle; -using 
roctracer_functions_t = std::vector>>; +namespace omnitrace +{ +using hsa_timer_t = hsa_rt_utils::Timer; +using timestamp_t = hsa_timer_t::timestamp_t; +using roctracer_bundle_t = + tim::component_bundle; +using roctracer_hsa_bundle_t = + tim::component_bundle; +using roctracer_functions_t = std::vector>>; std::unique_ptr& get_hsa_timer(); @@ -94,3 +97,4 @@ roctracer_setup_routines(); roctracer_functions_t& roctracer_tear_down_routines(); +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/config.hpp b/projects/rocprofiler-systems/include/library/config.hpp index 8b67261fa3..586d00a016 100644 --- a/projects/rocprofiler-systems/include/library/config.hpp +++ b/projects/rocprofiler-systems/include/library/config.hpp @@ -30,9 +30,11 @@ #include "library/api.hpp" #include "library/common.hpp" -#include "library/fork_gotcha.hpp" -#include "library/mpi_gotcha.hpp" -#include "library/roctracer.hpp" +#include "library/components/fork_gotcha.hpp" +#include "library/components/mpi_gotcha.hpp" +#include "library/components/pthread_gotcha.hpp" +#include "library/components/roctracer.hpp" +#include "library/defines.hpp" #include "library/state.hpp" #include "library/timemory.hpp" @@ -40,37 +42,42 @@ #include +namespace omnitrace +{ // bundle of components around omnitrace_init and omnitrace_finalize using main_bundle_t = tim::lightweight_tuple; + comp::cpu_util, comp::roctracer, comp::user_global_bundle, + fork_gotcha_t, mpi_gotcha_t, pthread_gotcha_t>; // bundle of components used in instrumentation using instrumentation_bundle_t = - tim::component_bundle; + tim::component_bundle; // allocator for instrumentation_bundle_t using bundle_allocator_t = tim::data::ring_buffer_allocator; // bundle of components around each thread +#if defined(TIMEMORY_RUSAGE_THREAD) && TIMEMORY_RUSAGE_THREAD > 0 using omnitrace_thread_bundle_t = tim::lightweight_tuple 0 - comp::peak_rss, + comp::thread_cpu_util, comp::peak_rss>; +#else +using omnitrace_thread_bundle_t 
= + tim::lightweight_tuple; #endif - papi_tot_ins>; // // Initialization routines // void -configure_settings(); +configure_settings() TIMEMORY_VISIBILITY("default"); void -print_config_settings(std::ostream& _os, - std::function&& _filter); +print_config_settings( + std::ostream& _os, + std::function&)>&& _filter); std::string& get_exe_name(); @@ -81,24 +88,39 @@ get_exe_name(); std::string get_config_file(); +bool +get_debug_env(); + bool get_debug(); -bool +bool& get_use_perfetto(); -bool +bool& get_use_timemory(); +bool& +get_use_roctracer(); + +bool& +get_use_sampling(); + bool& get_use_pid(); -bool +bool& get_use_mpip(); -bool +bool& get_use_critical_trace(); +bool +get_timeline_sampling(); + +bool +get_flat_sampling(); + bool get_roctracer_timeline_profile(); @@ -135,14 +157,20 @@ get_trace_hsa_api_types(); std::string& get_backend(); -std::string +std::string& get_perfetto_output_filename(); int64_t get_critical_trace_count(); size_t& -get_sample_rate(); +get_instrumentation_interval(); + +double& +get_sampling_freq(); + +double& +get_sampling_delay(); int64_t get_critical_trace_per_row(); @@ -161,3 +189,4 @@ get_cpu_cid(); std::unique_ptr>& get_cpu_cid_stack(int64_t _tid = threading::get_id()); +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/critical_trace.hpp b/projects/rocprofiler-systems/include/library/critical_trace.hpp index 2437ce513f..797873512a 100644 --- a/projects/rocprofiler-systems/include/library/critical_trace.hpp +++ b/projects/rocprofiler-systems/include/library/critical_trace.hpp @@ -29,8 +29,10 @@ #pragma once #include "library/config.hpp" +#include "library/defines.hpp" #include "library/thread_data.hpp" -#include "timemory/tpls/cereal/cereal/cereal.hpp" + +#include #include #include @@ -38,6 +40,8 @@ #include #include +namespace omnitrace +{ namespace critical_trace { enum class Device : short @@ -207,3 +211,4 @@ struct id {}; } // namespace critical_trace +} // namespace omnitrace diff --git 
a/projects/rocprofiler-systems/include/library/debug.hpp b/projects/rocprofiler-systems/include/library/debug.hpp index 360bf33c30..4d2ad08b18 100644 --- a/projects/rocprofiler-systems/include/library/debug.hpp +++ b/projects/rocprofiler-systems/include/library/debug.hpp @@ -28,17 +28,23 @@ #pragma once -#include +#include "library/defines.hpp" + #include #include #include #include +#include + +namespace omnitrace +{ bool get_debug(); bool get_critical_trace_debug(); +} // namespace omnitrace #if defined(TIMEMORY_USE_MPI) # define OMNITRACE_CONDITIONAL_PRINT(COND, ...) \ @@ -74,7 +80,8 @@ get_critical_trace_debug(); fflush(stderr); \ } -#define OMNITRACE_DEBUG(...) OMNITRACE_CONDITIONAL_PRINT(get_debug(), __VA_ARGS__) +#define OMNITRACE_DEBUG(...) \ + OMNITRACE_CONDITIONAL_PRINT(::omnitrace::get_debug(), __VA_ARGS__) #define OMNITRACE_PRINT(...) OMNITRACE_CONDITIONAL_PRINT(true, __VA_ARGS__) #define OMNITRACE_CT_DEBUG(...) \ - OMNITRACE_CONDITIONAL_PRINT(get_critical_trace_debug(), __VA_ARGS__) + OMNITRACE_CONDITIONAL_PRINT(::omnitrace::get_critical_trace_debug(), __VA_ARGS__) diff --git a/projects/rocprofiler-systems/include/library/defines.hpp.in b/projects/rocprofiler-systems/include/library/defines.hpp.in index 450e482e4f..b4abe04de5 100644 --- a/projects/rocprofiler-systems/include/library/defines.hpp.in +++ b/projects/rocprofiler-systems/include/library/defines.hpp.in @@ -33,10 +33,20 @@ #define OMNITRACE_HIP_VERSION_MAJOR @HIP_VERSION_MAJOR@ #define OMNITRACE_HIP_VERSION_MINOR @HIP_VERSION_MINOR@ #define OMNITRACE_HIP_VERSION_PATCH @HIP_VERSION_PATCH@ -// clang-format on #if defined(OMNITRACE_USE_ROCTRACER) # define OMNITRACE_ROCTRACER_LIBKFDWRAPPER "@roctracer_kfdwrapper_LIBRARY@" #else # define OMNITRACE_ROCTRACER_LIBKFDWRAPPER "/opt/rocm/roctracer/lib/libkfdwrapper64.so" #endif +// clang-format on + +#define TIMEMORY_USER_COMPONENT_ENUM \ + OMNITRACE_SAMPLING_WALL_CLOCK_idx, OMNITRACE_SAMPLING_CPU_CLOCK_idx, \ + OMNITRACE_SAMPLING_PERCENT_idx, 
OMNITRACE_COMPONENT_idx, OMNITRACE_ROCTRACER_idx, + +#define OMNITRACE_COMPONENT OMNITRACE_COMPONENT_idx +#define OMNITRACE_ROCTRACER OMNITRACE_ROCTRACER_idx +#define OMNITRACE_SAMPLING_WALL_CLOCK OMNITRACE_SAMPLING_WALL_CLOCK_idx +#define OMNITRACE_SAMPLING_CPU_CLOCK OMNITRACE_SAMPLING_CPU_CLOCK_idx +#define OMNITRACE_SAMPLING_PERCENT OMNITRACE_SAMPLING_PERCENT_idx diff --git a/projects/rocprofiler-systems/include/library/dynamic_library.hpp b/projects/rocprofiler-systems/include/library/dynamic_library.hpp index 1b0c6afd82..a5784ea254 100644 --- a/projects/rocprofiler-systems/include/library/dynamic_library.hpp +++ b/projects/rocprofiler-systems/include/library/dynamic_library.hpp @@ -29,11 +29,15 @@ #pragma once #include "library/debug.hpp" +#include "library/defines.hpp" + +#include #include #include -#include +namespace omnitrace +{ struct dynamic_library { dynamic_library() = delete; @@ -69,3 +73,4 @@ struct dynamic_library int flags = 0; void* handle = nullptr; }; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/gpu.hpp b/projects/rocprofiler-systems/include/library/gpu.hpp new file mode 100644 index 0000000000..3ef15ae569 --- /dev/null +++ b/projects/rocprofiler-systems/include/library/gpu.hpp @@ -0,0 +1,32 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +namespace omnitrace +{ +namespace gpu +{ +int +device_count(); +} +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/perfetto.hpp b/projects/rocprofiler-systems/include/library/perfetto.hpp index 6d981d454d..cc15aebe7f 100644 --- a/projects/rocprofiler-systems/include/library/perfetto.hpp +++ b/projects/rocprofiler-systems/include/library/perfetto.hpp @@ -28,6 +28,8 @@ #pragma once +#include "library/defines.hpp" + #if defined(PERFETTO_CATEGORIES) # error "PERFETTO_CATEGORIES is already defined. 
Please include \"" __FILE__ "\" before including any timemory files" #endif @@ -58,6 +60,8 @@ PERFETTO_DEFINE_CATEGORIES(PERFETTO_CATEGORIES); #endif +namespace omnitrace +{ #if defined(CUSTOM_DATA_SOURCE) class CustomDataSource : public perfetto::DataSource { @@ -89,3 +93,4 @@ public: PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(CustomDataSource); #endif +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/ptl.hpp b/projects/rocprofiler-systems/include/library/ptl.hpp index f47a3847c3..4b7101914e 100644 --- a/projects/rocprofiler-systems/include/library/ptl.hpp +++ b/projects/rocprofiler-systems/include/library/ptl.hpp @@ -28,11 +28,14 @@ #pragma once -#include "PTL/PTL.hh" -#include "timemory/macros/attributes.hpp" +#include "library/defines.hpp" + +#include #include +namespace omnitrace +{ namespace tasking { std::mutex& @@ -53,3 +56,4 @@ get_critical_trace_thread_pool(); PTL::TaskGroup& get_critical_trace_task_group(); } // namespace tasking +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/sampling.hpp b/projects/rocprofiler-systems/include/library/sampling.hpp new file mode 100644 index 0000000000..30d97b9d64 --- /dev/null +++ b/projects/rocprofiler-systems/include/library/sampling.hpp @@ -0,0 +1,76 @@ +// Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// with the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of Advanced Micro Devices, Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this Software without specific prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +// THE SOFTWARE. + +#pragma once + +#include "library/common.hpp" +#include "library/components/backtrace.hpp" +#include "library/components/fwd.hpp" +#include "library/defines.hpp" +#include "library/thread_data.hpp" +#include "library/timemory.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace omnitrace +{ +namespace sampling +{ +using component::backtrace; +using component::backtrace_cpu_clock; // NOLINT +using component::backtrace_fraction; // NOLINT +using component::backtrace_wall_clock; // NOLINT +using component::sampling_cpu_clock; +using component::sampling_percent; +using component::sampling_wall_clock; + +std::set +setup(); + +std::set +shutdown(); + +void block_signals(std::set = {}); + +void unblock_signals(std::set = {}); + +using bundle_t = tim::lightweight_tuple; +using sampler_t = tim::sampling::sampler; +using sampler_instances = omnitrace_thread_data; + +std::unique_ptr& +get_sampler(int64_t _tid = threading::get_id()); + +} // namespace sampling +} // namespace omnitrace diff --git 
a/projects/rocprofiler-systems/include/library/state.hpp b/projects/rocprofiler-systems/include/library/state.hpp index 2577b04638..86cb34fd79 100644 --- a/projects/rocprofiler-systems/include/library/state.hpp +++ b/projects/rocprofiler-systems/include/library/state.hpp @@ -28,6 +28,10 @@ #pragma once +#include "library/defines.hpp" + +namespace omnitrace +{ // used for specifying the state of omnitrace enum class State : unsigned short { @@ -36,3 +40,4 @@ enum class State : unsigned short Active, Finalized }; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/thread_data.hpp b/projects/rocprofiler-systems/include/library/thread_data.hpp index d5284f4743..0b42873025 100644 --- a/projects/rocprofiler-systems/include/library/thread_data.hpp +++ b/projects/rocprofiler-systems/include/library/thread_data.hpp @@ -29,6 +29,7 @@ #pragma once #include "library/config.hpp" +#include "library/defines.hpp" #include #include @@ -40,6 +41,8 @@ # define OMNITRACE_MAX_THREADS 1024 #endif +namespace omnitrace +{ static constexpr size_t max_supported_threads = OMNITRACE_MAX_THREADS; template @@ -63,8 +66,10 @@ template void omnitrace_thread_data::construct(Args&&... 
_args) { - static thread_local bool _v = [&_args...]() { - instances().at(threading::get_id()) = + // construct outside of lambda to prevent data-race + static auto& _instances = instances(); + static thread_local bool _v = [&_args...]() { + _instances.at(threading::get_id()) = std::make_unique(std::forward(_args)...); return true; }(); @@ -124,3 +129,4 @@ struct instrumentation_bundles static instance_array_t& instances(); }; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/library/timemory.hpp b/projects/rocprofiler-systems/include/library/timemory.hpp index a33f8fc73f..4f68942b18 100644 --- a/projects/rocprofiler-systems/include/library/timemory.hpp +++ b/projects/rocprofiler-systems/include/library/timemory.hpp @@ -28,36 +28,39 @@ #pragma once +#include "library/components/fwd.hpp" +#include "library/defines.hpp" + #include #include #include #include #include #include -#include #include #include #include -#include +#include #include #include #include #include #include -namespace audit = tim::audit; -namespace comp = tim::component; -namespace quirk = tim::quirk; -namespace threading = tim::threading; -namespace scope = tim::scope; -namespace dmp = tim::dmp; -namespace process = tim::process; -namespace units = tim::units; -namespace trait = tim::trait; +namespace omnitrace +{ +namespace audit = tim::audit; // NOLINT +namespace comp = tim::component; // NOLINT +namespace quirk = tim::quirk; // NOLINT +namespace threading = tim::threading; // NOLINT +namespace scope = tim::scope; // NOLINT +namespace dmp = tim::dmp; // NOLINT +namespace process = tim::process; // NOLINT +namespace units = tim::units; // NOLINT +namespace trait = tim::trait; // NOLINT // same sort of functionality as python's " ".join([...]) #if !defined(JOIN) # define JOIN(...) 
tim::mpl::apply::join(__VA_ARGS__) #endif - -using papi_tot_ins = comp::papi_tuple; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/include/omnitrace.hpp b/projects/rocprofiler-systems/include/omnitrace.hpp index 9d83528b5f..825bdbb8b7 100644 --- a/projects/rocprofiler-systems/include/omnitrace.hpp +++ b/projects/rocprofiler-systems/include/omnitrace.hpp @@ -32,7 +32,7 @@ #include "timemory/environment.hpp" #include "timemory/mpl/apply.hpp" #include "timemory/utility/argparse.hpp" -#include "timemory/utility/macros.hpp" +#include "timemory/utility/demangle.hpp" #include "timemory/utility/popen.hpp" #include "timemory/variadic/macros.hpp" @@ -49,9 +49,11 @@ #include #include +#include #include #include #include +#include #include #include // @@ -121,23 +123,15 @@ omnitrace_prefork_callback(thread_t* parent, thread_t* child); // // boolean settings // -static bool binary_rewrite = 0; -static bool loop_level_instr = false; -static bool werror = false; -static bool stl_func_instr = false; -static bool use_mpi = false; -static bool is_static_exe = false; -static bool use_return_info = false; -static bool use_args_info = false; -static bool use_file_info = false; -static bool use_line_info = false; +static bool use_return_info = false; +static bool use_args_info = false; +static bool use_file_info = false; +static bool use_line_info = false; // // integral settings // -static bool debug_print = false; -static int expect_error = NO_ERROR; -static int error_print = 0; -static int verbose_level = tim::get_env("TIMEMORY_RUN_VERBOSE", 0); +extern bool debug_print; +extern int verbose_level; // // string settings // @@ -150,7 +144,6 @@ static string_t prefer_library = {}; // global variables // static patch_pointer_t bpatch = {}; -static call_expr_t* initialize_expr = nullptr; static call_expr_t* terminate_expr = nullptr; static snippet_vec_t init_names = {}; static snippet_vec_t fini_names = {}; @@ -161,18 +154,18 @@ static regexvec_t func_include = {}; 
static regexvec_t func_exclude = {}; static regexvec_t file_include = {}; static regexvec_t file_exclude = {}; -static auto regex_opts = std::regex_constants::egrep | std::regex_constants::optimize; // //======================================================================================// // control debug printf statements #define dprintf(...) \ - if(debug_print || verbose_level > 0) fprintf(stderr, __VA_ARGS__); \ + if(debug_print || verbose_level > 0) \ + fprintf(stderr, "[omnitrace][exe] " __VA_ARGS__); \ fflush(stderr); // control verbose printf statements #define verbprintf(LEVEL, ...) \ - if(verbose_level >= LEVEL) fprintf(stdout, __VA_ARGS__); \ + if(verbose_level >= LEVEL) fprintf(stdout, "[omnitrace][exe] " __VA_ARGS__); \ fflush(stdout); //======================================================================================// @@ -195,6 +188,9 @@ extern "C" //======================================================================================// +strset_t +get_whole_function_names(); + function_signature get_func_file_line_info(module_t* mutatee_module, procedure_t* f); @@ -217,7 +213,7 @@ void errorFunc(error_level_t level, int num, const char** params); procedure_t* -find_function(image_t* appImage, const string_t& functionName, strset_t = {}); +find_function(image_t* appImage, const string_t& functionName, const strset_t& = {}); void error_func_real(error_level_t level, int num, const char* const* params); @@ -242,15 +238,15 @@ get_absolute_path(const char* fname) if(!(p = strrchr((char*) fname, '/'))) { - auto ret = getcwd(abs_exe_path, sizeof(abs_exe_path)); + auto* ret = getcwd(abs_exe_path, sizeof(abs_exe_path)); consume_parameters(ret); } else { - auto rets = getcwd(path_save, sizeof(path_save)); - auto retf = chdir(fname); - auto reta = getcwd(abs_exe_path, sizeof(abs_exe_path)); - auto retp = chdir(path_save); + auto* rets = getcwd(path_save, sizeof(path_save)); + auto retf = chdir(fname); + auto* reta = getcwd(abs_exe_path, 
sizeof(abs_exe_path)); + auto retp = chdir(path_save); consume_parameters(rets, retf, reta, retp); } return string_t(abs_exe_path); @@ -285,34 +281,32 @@ struct function_signature TIMEMORY_DEFAULT_OBJECT(function_signature) - function_signature(string_t _ret, string_t _name, string_t _file, + function_signature(string_t _ret, const string_t& _name, string_t _file, location_t _row = { 0, 0 }, location_t _col = { 0, 0 }, bool _loop = false, bool _info_beg = false, bool _info_end = false) : m_loop(_loop) , m_info_beg(_info_beg) , m_info_end(_info_end) - , m_row(_row) - , m_col(_col) - , m_return(_ret) + , m_row(std::move(_row)) + , m_col(std::move(_col)) + , m_return(std::move(_ret)) , m_name(tim::demangle(_name)) - , m_file(_file) + , m_file(std::move(_file)) { if(m_file.find('/') != string_t::npos) m_file = m_file.substr(m_file.find_last_of('/') + 1); } - function_signature(string_t _ret, string_t _name, string_t _file, - std::vector _params, location_t _row = { 0, 0 }, - location_t _col = { 0, 0 }, bool _loop = false, + function_signature(const string_t& _ret, const string_t& _name, const string_t& _file, + const std::vector& _params, location_t&& _row = { 0, 0 }, + location_t&& _col = { 0, 0 }, bool _loop = false, bool _info_beg = false, bool _info_end = false) : function_signature(_ret, _name, _file, _row, _col, _loop, _info_beg, _info_end) { - std::stringstream ss; - ss << "("; - for(auto& itr : _params) - ss << itr << ", "; - m_params = ss.str(); - m_params = m_params.substr(0, m_params.length() - 2); + m_params = "("; + for(const auto& itr : _params) + m_params.append(itr + ", "); + if(!_params.empty()) m_params = m_params.substr(0, m_params.length() - 2); m_params += ")"; } @@ -373,11 +367,11 @@ struct module_function get_width()[2] = std::max(get_width()[2], rhs.signature.get().length()); } - module_function(const string_t& _module, const string_t& _func, - const function_signature& _sign, procedure_t* proc) - : module(_module) - , function(_func) - , 
signature(_sign) + module_function(string_t _module, string_t _func, function_signature _sign, + procedure_t* proc) + : module(std::move(_module)) + , function(std::move(_func)) + , signature(std::move(_sign)) { if(proc) { @@ -482,35 +476,43 @@ dump_info(std::ostream& _os, const fmodset_t& _data) } // static inline void -dump_info(const string_t& _oname, const fmodset_t& _data, int _level) +dump_info(const string_t& _oname, const fmodset_t& _data, int _level, bool _fail) { if(!debug_print && verbose_level < _level) return; - std::ofstream ofs(_oname); + std::ofstream ofs{ _oname }; if(ofs) { verbprintf(_level, "Dumping '%s'... ", _oname.c_str()); dump_info(ofs, _data); verbprintf(_level, "Done\n"); } + else + { + std::stringstream _msg{}; + _msg << "[" << __FUNCTION__ << "] Error opening '" << _oname << " for output"; + verbprintf(_level, "%s\n", _msg.str().c_str()); + if(_fail) throw std::runtime_error(_msg.str()); + } ofs.close(); } // //======================================================================================// // -template +template ::value, int> = 0> snippet_pointer_t get_snippet(Tp arg) { - return snippet_pointer_t(new const_expr_t(arg)); + return std::make_shared(const_expr_t{ arg }); } // //======================================================================================// // -inline snippet_pointer_t -get_snippet(string_t arg) +template ::value, int> = 0> +snippet_pointer_t +get_snippet(const Tp& arg) { - return snippet_pointer_t(new const_expr_t(arg.c_str())); + return std::make_shared(const_expr_t{ arg.c_str() }); } // //======================================================================================// @@ -519,7 +521,7 @@ template snippet_pointer_vec_t get_snippets(Args&&... 
args) { - snippet_pointer_vec_t _tmp; + snippet_pointer_vec_t _tmp{}; TIMEMORY_FOLD_EXPRESSION(_tmp.push_back(get_snippet(std::forward(args)))); return _tmp; } @@ -587,8 +589,8 @@ private: //======================================================================================// // static inline address_space_t* -omnitrace_get_address_space(patch_pointer_t _bpatch, int _cmdc, char** _cmdv, - bool _rewrite, int _pid = -1, string_t _name = {}) +omnitrace_get_address_space(patch_pointer_t& _bpatch, int _cmdc, char** _cmdv, + bool _rewrite, int _pid = -1, const string_t& _name = {}) { address_space_t* mutatee = nullptr; @@ -599,7 +601,8 @@ omnitrace_get_address_space(patch_pointer_t _bpatch, int _cmdc, char** _cmdv, if(!_name.empty()) mutatee = _bpatch->openBinary(_name.c_str(), false); if(!mutatee) { - fprintf(stderr, "[omnitrace]> Failed to open binary '%s'\n", _name.c_str()); + fprintf(stderr, "[omnitrace][exe] Failed to open binary '%s'\n", + _name.c_str()); throw std::runtime_error("Failed to open binary"); } verbprintf(1, "Done\n"); @@ -612,7 +615,8 @@ omnitrace_get_address_space(patch_pointer_t _bpatch, int _cmdc, char** _cmdv, mutatee = _bpatch->processAttach(_cmdv0, _pid); if(!mutatee) { - fprintf(stderr, "[omnitrace]> Failed to connect to process %i\n", (int) _pid); + fprintf(stderr, "[omnitrace][exe] Failed to connect to process %i\n", + (int) _pid); throw std::runtime_error("Failed to attach to process"); } verbprintf(1, "Done\n"); @@ -630,7 +634,7 @@ omnitrace_get_address_space(patch_pointer_t _bpatch, int _cmdc, char** _cmdv, if(!_cmdv[i]) continue; ss << _cmdv[i] << " "; } - fprintf(stderr, "[omnitrace]> Failed to create process: '%s'\n", + fprintf(stderr, "[omnitrace][exe] Failed to create process: '%s'\n", ss.str().c_str()); throw std::runtime_error("Failed to create process"); } @@ -651,7 +655,7 @@ omnitrace_thread_exit(thread_t* thread, BPatch_exitType exit_type) if(!terminate_expr) { - fprintf(stderr, "[omnitrace]> continuing execution\n"); + 
fprintf(stderr, "[omnitrace][exe] continuing execution\n"); app->continueExecution(); return; } @@ -660,18 +664,18 @@ omnitrace_thread_exit(thread_t* thread, BPatch_exitType exit_type) { case ExitedNormally: { - fprintf(stderr, "[omnitrace]> Thread exited normally\n"); + fprintf(stderr, "[omnitrace][exe] Thread exited normally\n"); break; } case ExitedViaSignal: { - fprintf(stderr, "[omnitrace]> Thread terminated unexpectedly\n"); + fprintf(stderr, "[omnitrace][exe] Thread terminated unexpectedly\n"); break; } case NoExit: default: { - fprintf(stderr, "[omnitrace]> %s invoked with NoExit\n", __FUNCTION__); + fprintf(stderr, "[omnitrace][exe] %s invoked with NoExit\n", __FUNCTION__); break; } } @@ -679,7 +683,7 @@ omnitrace_thread_exit(thread_t* thread, BPatch_exitType exit_type) // terminate_expr = nullptr; thread->oneTimeCode(*terminate_expr); - fprintf(stderr, "[omnitrace]> continuing execution\n"); + fprintf(stderr, "[omnitrace][exe] continuing execution\n"); app->continueExecution(); } // @@ -703,7 +707,7 @@ omnitrace_fork_callback(thread_t* parent, thread_t* child) if(parent) { - auto app = parent->getProcess(); + auto* app = parent->getProcess(); if(app) { verbprintf(4, "Continuing execution on parent after fork callback...\n"); diff --git a/projects/rocprofiler-systems/scripts/.LICENSE.hpp b/projects/rocprofiler-systems/scripts/.LICENSE.hpp index 3028818331..00e54efb2f 100644 --- a/projects/rocprofiler-systems/scripts/.LICENSE.hpp +++ b/projects/rocprofiler-systems/scripts/.LICENSE.hpp @@ -1,29 +1,23 @@ -// Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal -// with the Software without restriction, including without limitation the -// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -// sell copies of the Software, and to permit persons to whom the Software is +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in the -// documentation and/or other materials provided with the distribution. -// -// * Neither the names of Advanced Micro Devices, Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this Software without specific prior written permission. +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH -// THE SOFTWARE. 
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. #pragma once diff --git a/projects/rocprofiler-systems/scripts/build-release.sh b/projects/rocprofiler-systems/scripts/build-release.sh index d7837863c3..efb8d2edff 100755 --- a/projects/rocprofiler-systems/scripts/build-release.sh +++ b/projects/rocprofiler-systems/scripts/build-release.sh @@ -1,15 +1,68 @@ -#!/bin/bash +#!/bin/bash -e +: ${EXTRA_ARGS:=""} +: ${EXTRA_TAGS:=""} : ${VERSION:=0.0.3} : ${ROCM_VERSION:=4.3.0} : ${NJOBS:=8} -STANDARD_ARGS="-DCPACK_GENERATOR=STGZ -DCMAKE_BUILD_TYPE=Release -DOMNITRACE_BUILD_DYNINST=ON -DTIMEMORY_BUILD_PORTABLE=ON" +STANDARD_ARGS="-DCPACK_GENERATOR=STGZ -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF -DOMNITRACE_MAX_THREADS=2048 -DOMNITRACE_BUILD_TESTING=OFF -DTIMEMORY_USE_LIBUNWIND=ON -DTIMEMORY_BUILD_LIBUNWIND=ON -DTIMEMORY_BUILD_PORTABLE=ON" +STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_BUILD_DYNINST=ON $(echo -DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON)" +if [ -n "${EXTRA_ARGS}" ]; then + STANDARD_ARGS="${STANDARD_ARGS} ${EXTRA_ARGS}" +fi -cmake -B build-release/core ${STANDARD_ARGS} -DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON -DDYNINST_USE_OpenMP=OFF -DOMNITRACE_USE_MPI_HEADERS=ON -DOMNITRACE_USE_ROCTRACER=OFF -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF -DOMNITRACE_MAX_THREADS=2048 . -cmake --build build-release/core --target package --parallel 8 -cp build-release/core/omnitrace-${VERSION}-Linux.sh build-release/omnitrace-${VERSION}-Linux.sh +PACKAGE_BASE_TAG=omnitrace-${VERSION}-Linux +if [ -n "${EXTRA_TAGS}" ]; then + PACKAGE_BASE_TAG="${PACKAGE_BASE_TAG}-${EXTRA_TAGS}" +fi -cmake -B build-release/rocm-mpi ${STANDARD_ARGS} -DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON -DDYNINST_USE_OpenMP=ON -DOMNITRACE_USE_MPI_HEADERS=ON -DOMNITRACE_USE_ROCTRACER=ON -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF -DOMNITRACE_MAX_THREADS=2048 . 
-cmake --build build-release/rocm-mpi --target package --parallel 8 -cp build-release/rocm-mpi/omnitrace-${VERSION}-Linux.sh build-release/omnitrace-${VERSION}-Linux-ROCm-${ROCM_VERSION}.sh +SCRIPT_DIR=$(realpath $(dirname ${BASH_SOURCE[0]})) +cd $(dirname ${SCRIPT_DIR}) +echo -e "Working directory: $(pwd)" + +umask 000 + +if [ ! -f build-release/${PACKAGE_BASE_TAG}.sh ]; then + cmake -B build-release/core ${STANDARD_ARGS} -DCMAKE_INSTALL_PREFIX=build-release/core/install-release -DDYNINST_USE_OpenMP=OFF -DOMNITRACE_USE_MPI_HEADERS=OFF -DOMNITRACE_USE_ROCTRACER=OFF . + cmake --build build-release/core --target package --parallel ${NJOBS} + cp build-release/core/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}.sh +fi + +apt-get install -y libmpich-dev mpich + +STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_USE_ROCTRACER=ON -DOMNITRACE_USE_MPI_HEADERS=ON -DDYNINST_USE_OpenMP=ON" + +if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}.sh ]; then + cmake -B build-release/rocm-${ROCM_VERSION} -DCMAKE_INSTALL_PREFIX=build-release/rocm-${ROCM_VERSION}/install-release ${STANDARD_ARGS} . + cmake --build build-release/rocm-${ROCM_VERSION} --target package --parallel ${NJOBS} + cp build-release/rocm-${ROCM_VERSION}/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}.sh +fi + +apt-get install -y libpapi-dev libpfm4-dev + +STANDARD_ARGS="${STANDARD_ARGS} -DTIMEMORY_USE_PAPI=ON" + +if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI.sh ]; then + cmake -B build-release/rocm-${ROCM_VERSION}-papi -DCMAKE_INSTALL_PREFIX=build-release/rocm-${ROCM_VERSION}-papi/install-release ${STANDARD_ARGS} . 
+ cmake --build build-release/rocm-${ROCM_VERSION}-papi --target package --parallel ${NJOBS} + cp build-release/rocm-${ROCM_VERSION}-papi/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI.sh +fi + +STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_USE_MPI=ON" +apt-get install -y libmpich-dev mpich + +if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-MPICH.sh ]; then + cmake -B build-release/rocm-${ROCM_VERSION}-papi-mpich -DCMAKE_INSTALL_PREFIX=build-release/rocm-${ROCM_VERSION}-papi-mpich/install-release ${STANDARD_ARGS} . + cmake --build build-release/rocm-${ROCM_VERSION}-papi-mpich --target package --parallel ${NJOBS} + cp build-release/rocm-${ROCM_VERSION}-papi-mpich/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-MPICH.sh +fi + +apt-get purge -y libmpich-dev mpich +apt-get install -y libopenmpi-dev openmpi-bin + +if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-OpenMPI.sh ]; then + cmake -B build-release/rocm-${ROCM_VERSION}-papi-openmpi -DCMAKE_INSTALL_PREFIX=build-release/rocm-${ROCM_VERSION}-papi-openmpi/install-release ${STANDARD_ARGS} . + cmake --build build-release/rocm-${ROCM_VERSION}-papi-openmpi --target package --parallel ${NJOBS} + cp build-release/rocm-${ROCM_VERSION}-papi-openmpi/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-OpenMPI.sh +fi diff --git a/projects/rocprofiler-systems/src/avail.cpp b/projects/rocprofiler-systems/src/avail.cpp new file mode 100644 index 0000000000..ee0faf7980 --- /dev/null +++ b/projects/rocprofiler-systems/src/avail.cpp @@ -0,0 +1,1389 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+ +#include "avail.hpp" + +#include "library/components/fork_gotcha.hpp" +#include "library/components/mpi_gotcha.hpp" +#include "library/components/omnitrace.hpp" +#include "library/components/pthread_gotcha.hpp" +#include "library/components/roctracer.hpp" +#include "library/config.hpp" +#include "library/sampling.hpp" + +#include "timemory/components.hpp" +#include "timemory/components/definition.hpp" +#include "timemory/components/placeholder.hpp" +#include "timemory/components/properties.hpp" +#include "timemory/components/skeletons.hpp" +#include "timemory/mpl/types.hpp" +#include "timemory/timemory.hpp" +#include "timemory/utility/argparse.hpp" +#include "timemory/utility/types.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(TIMEMORY_UNIX) +# include // ioctl() and TIOCGWINSZ +# include // for STDOUT_FILENO +#elif defined(TIMEMORY_WINDOWS) +# include +#endif + +using namespace tim; + +template +using array_t = std::array; +using string_t = std::string; +using stringstream_t = std::stringstream; +using str_vec_t = std::vector; +using info_type_base = std::tuple; + +struct info_type : info_type_base +{ + TIMEMORY_DEFAULT_OBJECT(info_type) + + template + info_type(Args&&... _args) + : info_type_base{ std::forward(_args)... 
} + {} + + const auto& name() const { return std::get<0>(*this); } + auto is_available() const { return std::get<1>(*this); } + const auto& info() const { return std::get<2>(*this); } + const auto& data_type() const { return info().at(0); } + const auto& enum_type() const { return info().at(1); } + const auto& id_type() const { return info().at(2); } + const auto& id_strings() const { return info().at(3); } + const auto& label() const { return info().at(4); } + const auto& description() const { return info().at(5); } + + bool valid() const { return !name().empty() && info().size() >= 5; } + + bool operator<(const info_type& rhs) const { return name() < rhs.name(); } + bool operator!=(const info_type& rhs) const { return !(*this == rhs); } + bool operator==(const info_type& rhs) const + { + if(info().size() != rhs.info().size()) return false; + for(size_t i = 0; i < info().size(); ++i) + { + if(info().at(i) != rhs.info().at(i)) return false; + } + return name() == rhs.name() && is_available() == rhs.is_available(); + } +}; + +namespace +{ +char global_delim = '|'; +bool markdown = false; +bool alphabetical = false; +bool available_only = false; +bool all_info = false; +bool force_brief = false; +bool debug_msg = false; +bool case_insensitive = false; +int32_t max_width = 0; +int32_t num_cols = 0; +int32_t min_width = 40; +int32_t padding = 4; +str_vec_t regex_keys = {}; +bool regex_hl = false; +constexpr size_t num_component_options = 6; +constexpr size_t num_settings_options = 3; +constexpr size_t num_hw_counter_options = 4; +std::stringstream lerr{}; +} // namespace + +//--------------------------------------------------------------------------------------// + +void +dump_log() +{ + if(debug_msg) + { + std::cerr << lerr.str() << std::flush; + lerr = std::stringstream{}; + } +} + +template +static IntArrayT +compute_max_columns(IntArrayT _widths, BoolArrayT _using); + +string_t +remove(string_t inp, const std::set& entries); + +template +void 
+write_entry(std::ostream& os, const Tp& _entry, int64_t _w, bool center, bool mark); + +template +string_t +banner(IntArrayT _breaks, std::array _use, char filler = '-', char delim = '|'); + +bool +is_selected(const std::string& line); + +std::string +hl_selected(const std::string& line); + +template +void +write_component_info(std::ostream&, const array_t&, const array_t&, + const array_t&); + +template +void +write_settings_info(std::ostream&, const array_t& = {}, + const array_t& = {}, const array_t& = {}); + +template +void +write_hw_counter_info(std::ostream&, const array_t& = {}, + const array_t& = {}, const array_t& = {}); + +template +struct get_availability; + +//--------------------------------------------------------------------------------------// + +template +struct get_availability +{ + using this_type = get_availability; + using metadata_t = component::metadata; + using property_t = component::properties; + + static info_type get_info(); + auto operator()() const { return get_info(); } +}; + +//--------------------------------------------------------------------------------------// + +template +struct get_availability> +{ + using data_type = std::vector; + + static data_type get_info(data_type& _v) + { + TIMEMORY_FOLD_EXPRESSION(_v.emplace_back(get_availability::get_info())); + return _v; + } + + static data_type get_info() + { + data_type _v{}; + return get_info(_v); + } + + template + decltype(auto) operator()(Args&&... _args) + { + return get_info(std::forward(_args)...); + } +}; + +//--------------------------------------------------------------------------------------// + +template <> +struct get_availability +{ + template + decltype(auto) operator()(tim::type_list, Args&&... _args) const + { + return get_availability>{}(std::forward(_args)...); + } + + template + decltype(auto) operator()(Args&&... 
_args) const + { + return get_availability>{}(std::forward(_args)...); + } +}; + +//--------------------------------------------------------------------------------------// + +enum +{ + VAL = 0, + ENUM = 1, + LANG = 2, + CID = 3, + FNAME = 4, + DESC = 5, + TOTAL = 6 +}; + +//--------------------------------------------------------------------------------------// + +int +main(int argc, char** argv) +{ + omnitrace::configure_settings(); + + array_t options = { false, false, false, false, false, false }; + array_t fields = {}; + array_t use_mark = {}; + + std::string cols_via{}; + std::tie(num_cols, cols_via) = tim::utility::console::get_columns(); + std::string col_msg = + "(default: " + std::to_string(num_cols) + " [via " + cols_via + "])"; + + fields[VAL] = "VALUE_TYPE"; + fields[ENUM] = "ENUMERATION"; + fields[LANG] = "C++ ALIAS / PYTHON ENUMERATION"; + fields[FNAME] = "FILENAME"; + fields[CID] = "STRING_IDS"; + fields[DESC] = "DESCRIPTION"; + + use_mark[VAL] = true; + use_mark[ENUM] = true; + use_mark[LANG] = true; + use_mark[FNAME] = false; + use_mark[CID] = false; + use_mark[DESC] = false; + + bool include_settings = false; + bool include_components = false; + bool include_hw_counters = false; + + std::string file = {}; + + using parser_t = tim::argparse::argument_parser; + parser_t parser("omnitrace-avail"); + + parser.enable_help(); + parser.set_help_width(40); + parser.add_argument({ "--debug" }, "Enable debug messages") + .max_count(1) + .action([](parser_t& p) { debug_msg = p.get("debug"); }); + parser.add_argument({ "-a", "--all" }, "Print all available info") + .max_count(1) + .action([&](parser_t& p) { + all_info = p.get("all"); + if(all_info) + { + for(auto& itr : options) + itr = true; + options[ENUM] = false; + options[LANG] = false; + include_components = true; + include_settings = true; + include_hw_counters = true; + } + }); + + parser.add_argument({ "" }, ""); + parser.add_argument({ "[CATEGORIES]" }, ""); + parser + .add_argument({ "-S", 
"--settings", "--print-settings" }, + "Display the runtime settings") + .max_count(1); + parser + .add_argument({ "-C", "--components", "--print-components" }, + "Only display the components data") + .max_count(1); + parser + .add_argument({ "-H", "--hw-counters", "--print-hw-counters" }, + "Write the available hardware counters") + .max_count(1); + + parser.add_argument({ "" }, ""); + parser.add_argument({ "[VIEW OPTIONS]" }, ""); + parser.add_argument({ "-A", "--available" }, "Only display available components") + .max_count(1) + .action([](parser_t& p) { available_only = p.get("available"); }); + parser + .add_argument({ "-r", "--filter" }, + "Filter the output according to provided regex (egrep + " + "case-sensitive) [e.g. -r \"true\"]") + .min_count(1) + .dtype("list of strings") + .action([](parser_t& p) { regex_keys = p.get("filter"); }); + parser.add_argument({ "-i", "--ignore-case" }, "Ignore case when filtering") + .max_count(1) + .dtype("bool") + .action([](parser_t& p) { case_insensitive = p.get("ignore-case"); }); + parser + .add_argument({ "-p", "--hl", "--highlight" }, + "Highlight regex matches (only available on UNIX)") + .max_count(1) + .action([](parser_t&) { regex_hl = true; }); + parser.add_argument({ "--alphabetical" }, "Sort the output alphabetically") + .max_count(1) + .action([](parser_t& p) { alphabetical = p.get("alphabetical"); }); + + parser.add_argument({ "" }, ""); + parser.add_argument({ "[COLUMN OPTIONS]" }, ""); + parser.add_argument({ "-b", "--brief" }, "Suppress availability/value info") + .max_count(1) + .action([](parser_t& p) { force_brief = p.get("brief"); }); + parser.add_argument({ "-d", "--description" }, "Display the component description") + .max_count(1); + parser.add_argument({ "-s", "--string" }, "Display all acceptable string identifiers") + .max_count(1); + parser + .add_argument({ "-v", "--value" }, + "Display the component data storage value type") + .max_count(1); + parser + .add_argument({ "-f", "--filename" }, 
+ "Display the output filename for the component") + .max_count(1); + + parser.add_argument({ "" }, ""); + parser.add_argument({ "[WIDTH OPTIONS]" }, ""); + parser + .add_argument({ "-w", "--width" }, + "if w > 0, truncate any columns greater than this width") + .count(1) + .dtype("int") + .action([](parser_t& p) { max_width = p.get("width"); }); + parser + .add_argument( + { "-c", "--columns" }, + std::string{ "if c > 0, truncate the total width of all the columns to this " + "value. Set '-w 0 -c 0' to remove all truncation" } + + col_msg) + .set_default(num_cols) + .count(1) + .dtype("int") + .action([](parser_t& p) { num_cols = p.get("columns"); }); + + parser.add_argument({ "" }, ""); + parser.add_argument({ "[OUTPUT OPTIONS]" }, ""); + parser.add_argument({ "-O", "--output" }, "Write results to file") + .count(1) + .dtype("filename"); + parser.add_argument({ "-M", "--markdown" }, "Write data in markdown") + .max_count(1) + .action([](parser_t& p) { markdown = p.get("markdown"); }); + + parser.add_positional_argument("REGEX_FILTER").set_default(std::string{}); + + auto err = parser.parse(argc, argv); + + if(parser.exists("help")) + { + parser.print_help(); + return EXIT_SUCCESS; + } + + if(err) + { + std::cerr << err << std::endl; + parser.print_help(); + return EXIT_FAILURE; + } + + std::string _pos_regex{}; + if(parser.get_positional_count() > 0) + { + err = parser.get("REGEX_FILTER", _pos_regex); + if(err) + { + std::cerr << err << std::endl; + parser.print_help(); + return EXIT_FAILURE; + } + } + + if(!_pos_regex.empty()) regex_keys.emplace_back(_pos_regex); + + auto _parser_set_if_exists = [&parser](auto& _var, const std::string& _opt) { + using Tp = decay_t; + if(parser.exists(_opt)) _var = parser.get(_opt); + }; + + _parser_set_if_exists(options[FNAME], "filename"); + _parser_set_if_exists(options[DESC], "description"); + _parser_set_if_exists(options[VAL], "value"); + _parser_set_if_exists(options[CID], "string"); + _parser_set_if_exists(file, 
"output"); + _parser_set_if_exists(include_components, "components"); + _parser_set_if_exists(include_settings, "settings"); + _parser_set_if_exists(include_hw_counters, "hw-counters"); + + if(!include_components && !include_settings && !include_hw_counters) + include_components = true; + + if(markdown || include_hw_counters) padding = 6; + + std::ostream* os = nullptr; + std::ofstream ofs; + if(!file.empty()) + { + ofs.open(file.c_str()); + if(ofs) + { + os = &ofs; + } + else + { + std::cerr << "Error opening output file: " << file << std::endl; + } + } + + if(!os) os = &std::cout; + + if(include_components) write_component_info(*os, options, use_mark, fields); + + dump_log(); + + if(include_settings) + write_settings_info(*os, { options[VAL], options[LANG], options[DESC] }); + + dump_log(); + + if(include_hw_counters) + write_hw_counter_info(*os, { true, !force_brief, !options[DESC], options[DESC] }); + + dump_log(); + + return 0; +} + +//--------------------------------------------------------------------------------------// + +template +struct enumerated_list; + +template