diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 58db1d2797..6bb8efd007 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -35,6 +35,23 @@ parse: PUBLIC: '*' PRIVATE: '*' INTERFACE: '*' + omnitrace_add_bin_test: + flags: + - WILL_FAIL + kwargs: + NAME: '*' + ARGS: '*' + LABELS: '*' + TARGET: '*' + DEPENDS: '*' + COMMAND: '*' + TIMEOUT: '*' + PROPERTIES: '*' + ENVIRONMENT: '*' + WORKING_DIRECTORY: '*' + PASS_REGULAR_EXPRESSION: '*' + FAIL_REGULAR_EXPRESSION: '*' + SKIP_REGULAR_EXPRESSION: '*' override_spec: {} vartags: [] proptags: [] diff --git a/.github/workflows/linux-ci.yml b/.github/workflows/linux-ci.yml index 6a16872260..9f4da83578 100644 --- a/.github/workflows/linux-ci.yml +++ b/.github/workflows/linux-ci.yml @@ -28,7 +28,7 @@ jobs: timeout-minutes: 5 run: sudo apt-get update && - sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev ${{ matrix.compiler }} ${{ matrix.mpi }} && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev clang libomp-dev ${{ matrix.compiler }} ${{ matrix.mpi }} && python3 -m pip install --upgrade pip && python3 -m pip install 'cmake==3.16.3' @@ -107,7 +107,7 @@ jobs: with: name: data-files path: | - ${{ github.workspace }}/build/omnitrace-tests-output/*.txt + ${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt ubuntu-bionic: runs-on: ubuntu-18.04 @@ -202,7 +202,7 @@ jobs: with: name: data-files path: | - ${{ github.workspace }}/build/omnitrace-tests-output/*.txt + ${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt ubuntu-focal-external: runs-on: ubuntu-20.04 @@ -217,7 +217,7 @@ jobs: timeout-minutes: 5 run: sudo apt-get update && - sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libboost-{atomic,system,thread,date-time,filesystem,timer}-dev libtbb-dev libiberty-dev ${{ matrix.compiler 
}} && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libboost-{atomic,system,thread,date-time,filesystem,timer}-dev libtbb-dev libiberty-dev clang libomp-dev ${{ matrix.compiler }} && sudo python3 -m pip install --upgrade pip && python3 -m pip install 'cmake==3.16.3' @@ -321,7 +321,7 @@ jobs: with: name: data-files path: | - ${{ github.workspace }}/build/omnitrace-tests-output/*.txt + ${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt ubuntu-focal-dyninst-package: runs-on: ubuntu-20.04 @@ -336,7 +336,7 @@ jobs: timeout-minutes: 5 run: sudo apt-get update && - sudo apt-get install -y build-essential m4 autoconf libtool python3-pip ${{ matrix.compiler }} && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} && sudo python3 -m pip install --upgrade pip && python3 -m pip install 'cmake==3.16.3' @@ -432,7 +432,7 @@ jobs: with: name: data-files path: | - ${{ github.workspace }}/build/omnitrace-tests-output/*.txt + ${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt ubuntu-focal-external-rocm: runs-on: ubuntu-20.04 @@ -454,7 +454,7 @@ jobs: sudo wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add - && echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | sudo tee /etc/apt/sources.list.d/rocm.list && sudo apt-get update && - sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libboost-{atomic,system,thread,date-time,filesystem,timer}-dev libtbb-dev libiberty-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${{ matrix.mpi }} libpapi-dev && + sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libboost-{atomic,system,thread,date-time,filesystem,timer}-dev libtbb-dev libiberty-dev clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev 
rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${{ matrix.mpi }} libpapi-dev && sudo python3 -m pip install --upgrade pip && python3 -m pip install 'cmake==3.16.3' diff --git a/CMakeLists.txt b/CMakeLists.txt index 083aa11222..e3b1f4266b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,9 @@ omnitrace_add_option( ${OMNITRACE_USE_HIP}) omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" OFF) +omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" OFF) omnitrace_add_option(OMNITRACE_BUILD_DYNINST "Build dyninst from submodule" OFF) +omnitrace_add_option(OMNITRACE_BUILD_EXAMPLES "Enable building the examples" OFF) omnitrace_add_option(OMNITRACE_BUILD_TESTING "Enable building the testing suite" OFF) omnitrace_add_option(OMNITRACE_CUSTOM_DATA_SOURCE "Enable custom data source" OFF) omnitrace_add_option(OMNITRACE_BUILD_HIDDEN_VISIBILITY @@ -98,6 +100,12 @@ if(NOT OMNITRACE_USE_HIP) CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE) endif() +if(OMNITRACE_BUILD_TESTING) + set(OMNITRACE_BUILD_EXAMPLES + ON + CACHE BOOL "Enable building the examples" FORCE) +endif() + include(ProcessorCount) processorcount(OMNITRACE_PROCESSOR_COUNT) math(EXPR OMNITRACE_THREAD_COUNT "16 * ${OMNITRACE_PROCESSOR_COUNT}") @@ -151,6 +159,11 @@ if(OMNITRACE_BUILD_HIDDEN_VISIBILITY) set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) endif() +if(OMNITRACE_BUILD_TESTING) + include(CTest) + enable_testing() +endif() + # ------------------------------------------------------------------------------# # # library and executables @@ -193,7 +206,7 @@ if(OMNITRACE_BUILD_LTO) omnitrace_restore_variables(LTO VARIABLES CMAKE_INTERPROCEDURAL_OPTIMIZATION) endif() -if(OMNITRACE_BUILD_TESTING) +if(OMNITRACE_BUILD_EXAMPLES) add_subdirectory(examples) endif() @@ -204,9 +217,6 @@ endif() # ------------------------------------------------------------------------------# 
if(OMNITRACE_BUILD_TESTING) - include(CTest) - enable_testing() - add_subdirectory(tests) endif() diff --git a/cmake/Formatting.cmake b/cmake/Formatting.cmake index 31a5a6790d..3838981c0a 100644 --- a/cmake/Formatting.cmake +++ b/cmake/Formatting.cmake @@ -40,25 +40,45 @@ endmacro() find_program(OMNITRACE_CLANG_FORMAT_EXE NAMES clang-format-11 clang-format-mp-11 clang-format) +find_program(OMNITRACE_CMAKE_FORMAT_EXE NAMES cmake-format) + if(OMNITRACE_CLANG_FORMAT_EXE) file(GLOB_RECURSE sources ${PROJECT_SOURCE_DIR}/source/*.cpp) file(GLOB_RECURSE headers ${PROJECT_SOURCE_DIR}/source/*.hpp - ${PROJECT_SOURCE_DIR}/source/*.hpp.in) + ${PROJECT_SOURCE_DIR}/source/*.hpp.in ${PROJECT_SOURCE_DIR}/source/*.h + ${PROJECT_SOURCE_DIR}/source/*.h.in) file(GLOB_RECURSE examples ${PROJECT_SOURCE_DIR}/examples/*.cpp ${PROJECT_SOURCE_DIR}/examples/*.hpp) - file(GLOB_RECURSE external ${PROJECT_SOURCE_DIR}/examples/lulesh/external/*.cpp - ${PROJECT_SOURCE_DIR}/examples/lulesh/external/*.hpp) + file(GLOB_RECURSE external ${PROJECT_SOURCE_DIR}/examples/lulesh/external/kokkos/*) + file(GLOB_RECURSE cmake_files ${PROJECT_SOURCE_DIR}/source/*CMakeLists.txt + ${PROJECT_SOURCE_DIR}/examples/*CMakeLists.txt + ${PROJECT_SOURCE_DIR}/tests/*CMakeLists.txt ${PROJECT_SOURCE_DIR}/cmake/*.cmake) + list(APPEND cmake_files ${PROJECT_SOURCE_DIR}/CMakeLists.txt) if(external) list(REMOVE_ITEM examples ${external}) + list(REMOVE_ITEM cmake_files ${external}) endif() add_custom_target( - format-omnitrace + format-omnitrace-source ${OMNITRACE_CLANG_FORMAT_EXE} -i ${sources} ${headers} ${examples} - COMMENT "Running C++ formatter ${OMNITRACE_CLANG_FORMAT_EXE}...") + COMMENT "[omnitrace] Running C++ formatter ${OMNITRACE_CLANG_FORMAT_EXE}...") + add_custom_target(format-omnitrace) + add_dependencies(format-omnitrace format-omnitrace-source) if(NOT TARGET format) add_custom_target(format) endif() add_dependencies(format format-omnitrace) + if(OMNITRACE_CMAKE_FORMAT_EXE) + add_custom_target( + 
format-omnitrace-cmake + ${OMNITRACE_CMAKE_FORMAT_EXE} -i ${cmake_files} + COMMENT "[omnitrace] Running CMake formatter ${OMNITRACE_CMAKE_FORMAT_EXE}..." + ) + if(NOT TARGET format-cmake) + add_custom_target(format-cmake) + endif() + add_dependencies(format-cmake format-omnitrace-cmake) + endif() else() message( AUTHOR_WARNING diff --git a/cmake/Packages.cmake b/cmake/Packages.cmake index 5c140fc17d..1a236b7795 100644 --- a/cmake/Packages.cmake +++ b/cmake/Packages.cmake @@ -350,6 +350,9 @@ set(TIMEMORY_USE_GOTCHA set(TIMEMORY_USE_PERFETTO OFF CACHE BOOL "Disable perfetto support in timemory") +set(TIMEMORY_USE_OMPT + ${OMNITRACE_USE_OMPT} + CACHE BOOL "Enable OMPT support in timemory" FORCE) set(TIMEMORY_USE_LIBUNWIND ON CACHE BOOL "Enable libunwind support in timemory") @@ -369,7 +372,9 @@ set(TIMEMORY_BUILD_EXTRA_OPTIMIZATIONS set(TIMEMORY_TLS_MODEL "global-dynamic" CACHE STRING "Thread-local static model" FORCE) - +set(TIMEMORY_MAX_THREADS + "${OMNITRACE_MAX_THREADS}" + CACHE STRING "Max statically-allocated threads" FORCE) set(TIMEMORY_SETTINGS_PREFIX "OMNITRACE_" CACHE STRING "Prefix used for settings and environment variables") @@ -430,19 +435,24 @@ if(NOT TARGET PTL::ptl-shared) omnitrace_save_variables( BUILD_CONFIG - VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS CMAKE_POSITION_INDEPENDENT_CODE - CMAKE_CXX_VISIBILITY_PRESET CMAKE_VISIBILITY_INLINES_HIDDEN) + VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS BUILD_OBJECT_LIBS + CMAKE_POSITION_INDEPENDENT_CODE CMAKE_CXX_VISIBILITY_PRESET + CMAKE_VISIBILITY_INLINES_HIDDEN) - set(BUILD_SHARED_LIBS ON) + set(BUILD_SHARED_LIBS OFF) set(BUILD_STATIC_LIBS OFF) + set(BUILD_OBJECT_LIBS ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) + set(CMAKE_CXX_VISIBILITY_PRESET "hidden") set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) add_subdirectory(external/PTL) omnitrace_restore_variables( BUILD_CONFIG - VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS CMAKE_POSITION_INDEPENDENT_CODE - CMAKE_CXX_VISIBILITY_PRESET 
CMAKE_VISIBILITY_INLINES_HIDDEN) + VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS BUILD_OBJECT_LIBS + CMAKE_POSITION_INDEPENDENT_CODE CMAKE_CXX_VISIBILITY_PRESET + CMAKE_VISIBILITY_INLINES_HIDDEN) endif() -target_link_libraries(omnitrace-ptl INTERFACE PTL::ptl-shared) +target_sources(omnitrace-ptl INTERFACE $<TARGET_OBJECTS:PTL::ptl-object>) +target_link_libraries(omnitrace-ptl INTERFACE PTL::ptl-object) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 85a7cbd2fc..e27b43892f 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -7,6 +7,8 @@ set(CMAKE_CXX_VISIBILITY_PRESET "default") add_subdirectory(transpose) add_subdirectory(parallel-overhead) +add_subdirectory(user-api) +add_subdirectory(openmp) option(BUILD_SHARED_LIBS "Build dynamic libraries" ON) diff --git a/examples/lulesh/CMakeLists.txt b/examples/lulesh/CMakeLists.txt index ffe64737ae..131d3637cc 100644 --- a/examples/lulesh/CMakeLists.txt +++ b/examples/lulesh/CMakeLists.txt @@ -55,17 +55,3 @@ if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) endif() - -enable_testing() -if(LULESH_USE_MPI) - add_test( - NAME lulesh - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 8 - $ -i 100 -s 20 -p - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) -else() - add_test( - NAME lulesh - COMMAND $ -i 100 -s 20 -p - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) -endif() diff --git a/examples/openmp/CG/cg.cpp b/examples/openmp/CG/cg.cpp new file mode 100644 index 0000000000..47d178a5d0 --- /dev/null +++ b/examples/openmp/CG/cg.cpp @@ -0,0 +1,1055 @@ +/* +MIT License + +Copyright (c) 2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. 
Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +Authors of the Fortran code: + M. Yarrow + C. Kuszmaul + H. 
Jin + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff + +------------------------------------------------------------------------------ + +The OpenMP version is a parallel implementation of the serial C++ version +OpenMP version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-OMP + +Authors of the OpenMP code: + Júnior Löff + +*/ + +#include "../common/npb-CPP.hpp" +#include "npbparams.hpp" +#include "omp.h" + +/* + * --------------------------------------------------------------------- + * note: please observe that in the routine conj_grad three + * implementations of the sparse matrix-vector multiply have + * been supplied. the default matrix-vector multiply is not + * loop unrolled. the alternate implementations are unrolled + * to a depth of 2 and unrolled to a depth of 8. please + * experiment with these to find the fastest for your particular + * architecture. if reporting timing results, any of these three may + * be used without penalty. + * --------------------------------------------------------------------- + * class specific parameters: + * it appears here for reference only. + * these are their values, however, this info is imported in the npbparams.h + * include file, which is written by the sys/setparams.c program. 
+ * --------------------------------------------------------------------- + */ +#define NZ (NA * (NONZER + 1) * (NONZER + 1)) +#define NAZ (NA * (NONZER + 1)) +#define T_INIT 0 +#define T_BENCH 1 +#define T_CONJ_GRAD 2 +#define T_LAST 3 + +/* global variables */ +#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION) +static int colidx[NZ]; +static int rowstr[NA + 1]; +static int iv[NA]; +static int arow[NA]; +static int acol[NAZ]; +static double aelt[NAZ]; +static double a[NZ]; +static double x[NA + 2]; +static double z[NA + 2]; +static double p[NA + 2]; +static double q[NA + 2]; +static double r[NA + 2]; +#else +static int(*colidx) = (int*) malloc(sizeof(int) * (NZ)); +static int(*rowstr) = (int*) malloc(sizeof(int) * (NA + 1)); +static int(*iv) = (int*) malloc(sizeof(int) * (NA)); +static int(*arow) = (int*) malloc(sizeof(int) * (NA)); +static int(*acol) = (int*) malloc(sizeof(int) * (NAZ)); +static double(*aelt) = (double*) malloc(sizeof(double) * (NAZ)); +static double(*a) = (double*) malloc(sizeof(double) * (NZ)); +static double(*x) = (double*) malloc(sizeof(double) * (NA + 2)); +static double(*z) = (double*) malloc(sizeof(double) * (NA + 2)); +static double(*p) = (double*) malloc(sizeof(double) * (NA + 2)); +static double(*q) = (double*) malloc(sizeof(double) * (NA + 2)); +static double(*r) = (double*) malloc(sizeof(double) * (NA + 2)); +#endif +static int naa; +static int nzz; +static int firstrow; +static int lastrow; +static int firstcol; +static int lastcol; +static double amult; +static double tran; +static boolean timeron; + +/* function prototypes */ +static void +conj_grad(const int colidx[], const int rowstr[], const double x[], double z[], + const double a[], double p[], double q[], double r[], double* rnorm); +static int +icnvrt(double x, int ipwr2); +static void +makea(int n, int nz, double a[], int colidx[], int rowstr[], int firstrow, int lastrow, + int firstcol, int lastcol, int arow[], int acol[][NONZER + 1], + double 
aelt[][NONZER + 1], int iv[]); +static void +sparse(double a[], int colidx[], int rowstr[], int n, int nz, int nozer, const int arow[], + int acol[][NONZER + 1], double aelt[][NONZER + 1], int firstrow, int lastrow, + int nzloc[], double rcond, double shift); +static void +sprnvc(int n, int nz, int nn1, double v[], int iv[]); +static void +vecset(int n, double v[], int iv[], int* nzv, int i, double val); + +/* cg */ +int +main(int /*argc*/, char** /*argv*/) +{ +#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION) + printf( + " DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION mode on\n"); +#endif + int i, j, k, it; + double zeta; + double rnorm; + double norm_temp1, norm_temp2; + double t, mflops, tmax; + char class_npb; + boolean verified; + double zeta_verify_value, epsilon, err; + + char* t_names[T_LAST]; + + for(i = 0; i < T_LAST; i++) + { + timer_clear(i); + } + + FILE* fp; + if((fp = fopen("timer.flag", "r")) != nullptr) + { + timeron = TRUE; + t_names[T_INIT] = (char*) "init"; + t_names[T_BENCH] = (char*) "benchmk"; + t_names[T_CONJ_GRAD] = (char*) "conjgd"; + fclose(fp); + } + else + { + timeron = FALSE; + } + + timer_start(T_INIT); + + firstrow = 0; + lastrow = NA - 1; + firstcol = 0; + lastcol = NA - 1; + + if(NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) + { + class_npb = 'S'; + zeta_verify_value = 8.5971775078648; + } + else if(NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) + { + class_npb = 'W'; + zeta_verify_value = 10.362595087124; + } + else if(NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) + { + class_npb = 'A'; + zeta_verify_value = 17.130235054029; + } + else if(NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) + { + class_npb = 'B'; + zeta_verify_value = 22.712745482631; + } + else if(NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) + { + class_npb = 'C'; + zeta_verify_value = 28.973605592845; + } + else if(NA == 1500000 && NONZER == 21 && 
NITER == 100 && SHIFT == 500.0) + { + class_npb = 'D'; + zeta_verify_value = 52.514532105794; + } + else if(NA == 9000000 && NONZER == 26 && NITER == 100 && SHIFT == 1500.0) + { + class_npb = 'E'; + zeta_verify_value = 77.522164599383; + } + else + { + class_npb = 'U'; + } + + printf("\n\n NAS Parallel Benchmarks 4.1 Parallel C++ version with OpenMP - CG " + "Benchmark\n\n"); + printf(" Size: %11d\n", NA); + printf(" Iterations: %5d\n", NITER); + + naa = NA; + nzz = NZ; + + /* initialize random number generator */ + tran = 314159265.0; + amult = 1220703125.0; + zeta = randlc(&tran, amult); + + makea(naa, nzz, a, colidx, rowstr, firstrow, lastrow, firstcol, lastcol, arow, + (int(*)[NONZER + 1])(void*) acol, (double(*)[NONZER + 1])(void*) aelt, iv); + +/* + * --------------------------------------------------------------------- + * note: as a result of the above call to makea: + * values of j used in indexing rowstr go from 0 --> lastrow-firstrow + * values of colidx which are col indexes go from firstcol --> lastcol + * so: + * shift the col index vals from actual (firstcol --> lastcol) + * to local, i.e., (0 --> lastcol-firstcol) + * --------------------------------------------------------------------- + */ +#pragma omp parallel private(it, i, j, k) + { +#pragma omp for nowait + for(j = 0; j < lastrow - firstrow + 1; j++) + { + for(k = rowstr[j]; k < rowstr[j + 1]; k++) + { + colidx[k] = colidx[k] - firstcol; + } + } + +/* set starting vector to (1, 1, .... 
1) */ +#pragma omp for nowait + for(i = 0; i < NA + 1; i++) + { + x[i] = 1.0; + } +#pragma omp for nowait + for(j = 0; j < lastcol - firstcol + 1; j++) + { + q[j] = 0.0; + z[j] = 0.0; + r[j] = 0.0; + p[j] = 0.0; + } + +#pragma omp single + zeta = 0.0; + + /* + * ------------------------------------------------------------------- + * ----> + * do one iteration untimed to init all code and data page tables + * ----> (then reinit, start timing, to niter its) + * -------------------------------------------------------------------*/ + + for(it = 1; it <= 1; it++) + { + /* the call to the conjugate gradient routine */ + conj_grad(colidx, rowstr, x, z, a, p, q, r, &rnorm); +#pragma omp single + { + norm_temp1 = 0.0; + norm_temp2 = 0.0; + } + +/* + * -------------------------------------------------------------------- + * zeta = shift + 1/(x.z) + * so, first: (x.z) + * also, find norm of z + * so, first: (z.z) + * -------------------------------------------------------------------- + */ +#pragma omp for reduction(+ : norm_temp1, norm_temp2) + for(j = 0; j < lastcol - firstcol + 1; j++) + { + norm_temp1 += x[j] * z[j]; + norm_temp2 += +z[j] * z[j]; + } + +#pragma omp single + norm_temp2 = 1.0 / sqrt(norm_temp2); + +/* normalize z to obtain x */ +#pragma omp for + for(j = 0; j < lastcol - firstcol + 1; j++) + { + x[j] = norm_temp2 * z[j]; + } + + } /* end of do one iteration untimed */ + +/* set starting vector to (1, 1, .... 
1) */ +#pragma omp for + for(i = 0; i < NA + 1; i++) + { + x[i] = 1.0; + } + +#pragma omp single + zeta = 0.0; + +#pragma omp master + { + timer_stop(T_INIT); + + printf(" Initialization time = %15.3f seconds\n", timer_read(T_INIT)); + + timer_start(T_BENCH); + } + + /* + * -------------------------------------------------------------------- + * ----> + * main iteration for inverse power method + * ----> + * -------------------------------------------------------------------- + */ + for(it = 1; it <= NITER; it++) + { +/* the call to the conjugate gradient routine */ +#pragma omp master + if(timeron != 0) + { + timer_start(T_CONJ_GRAD); + } + conj_grad(colidx, rowstr, x, z, a, p, q, r, &rnorm); +#pragma omp master + if(timeron != 0) + { + timer_stop(T_CONJ_GRAD); + } + +#pragma omp single + { + norm_temp1 = 0.0; + norm_temp2 = 0.0; + } + +/* + * -------------------------------------------------------------------- + * zeta = shift + 1/(x.z) + * so, first: (x.z) + * also, find norm of z + * so, first: (z.z) + * -------------------------------------------------------------------- + */ +#pragma omp for reduction(+ : norm_temp1, norm_temp2) + for(j = 0; j < lastcol - firstcol + 1; j++) + { + norm_temp1 += x[j] * z[j]; + norm_temp2 += z[j] * z[j]; + } +#pragma omp single + { + norm_temp2 = 1.0 / sqrt(norm_temp2); + zeta = SHIFT + 1.0 / norm_temp1; + } + +#pragma omp master + { + if(it == 1) + { + printf("\n iteration ||r|| zeta\n"); + } + printf(" %5d %20.14e%20.13e\n", it, rnorm, zeta); + } +/* normalize z to obtain x */ +#pragma omp for + for(j = 0; j < lastcol - firstcol + 1; j++) + { + x[j] = norm_temp2 * z[j]; + } + } /* end of main iter inv pow meth */ + } /* end parallel */ + timer_stop(T_BENCH); + + /* + * -------------------------------------------------------------------- + * end of timed section + * -------------------------------------------------------------------- + */ + + t = timer_read(T_BENCH); + + printf(" Benchmark completed\n"); + + epsilon = 1.0e-10; 
+ if(class_npb != 'U') + { + err = fabs(zeta - zeta_verify_value) / zeta_verify_value; + if(err <= epsilon) + { + verified = TRUE; + printf(" VERIFICATION SUCCESSFUL\n"); + printf(" Zeta is %20.13e\n", zeta); + printf(" Error is %20.13e\n", err); + } + else + { + verified = FALSE; + printf(" VERIFICATION FAILED\n"); + printf(" Zeta %20.13e\n", zeta); + printf(" The correct zeta is %20.13e\n", zeta_verify_value); + } + } + else + { + verified = FALSE; + printf(" Problem size unknown\n"); + printf(" NO VERIFICATION PERFORMED\n"); + } + if(t != 0.0) + { + mflops = (double) (2.0 * NITER * NA) * + (3.0 + (double) (NONZER * (NONZER + 1)) + + 25.0 * (5.0 + (double) (NONZER * (NONZER + 1))) + 3.0) / + t / 1000000.0; + } + else + { + mflops = 0.0; + } + setenv("OMP_NUM_THREADS", "1", 0); + c_print_results((char*) "CG", class_npb, NA, 0, 0, NITER, t, mflops, + (char*) " floating point", verified, (char*) NPBVERSION, + (char*) COMPILETIME, (char*) COMPILERVERSION, (char*) LIBVERSION, + std::getenv("OMP_NUM_THREADS"), (char*) CS1, (char*) CS2, (char*) CS3, + (char*) CS4, (char*) CS5, (char*) CS6, (char*) CS7); + + /* + * --------------------------------------------------------------------- + * more timers + * --------------------------------------------------------------------- + */ + if(timeron != 0) + { + tmax = timer_read(T_BENCH); + if(tmax == 0.0) + { + tmax = 1.0; + } + printf(" SECTION Time (secs)\n"); + for(i = 0; i < T_LAST; i++) + { + t = timer_read(i); + if(i == T_INIT) + { + printf(" %8s:%9.3f\n", t_names[i], t); + } + else + { + printf(" %8s:%9.3f (%6.2f%%)\n", t_names[i], t, t * 100.0 / tmax); + if(i == T_CONJ_GRAD) + { + t = tmax - t; + printf(" --> %8s:%9.3f (%6.2f%%)\n", "rest", t, t * 100.0 / tmax); + } + } + } + } + + return 0; +} + +/* + * --------------------------------------------------------------------- + * floating point arrays here are named as in NPB1 spec discussion of + * CG algorithm + * 
--------------------------------------------------------------------- + */ +static void +conj_grad(const int colidx[], const int rowstr[], const double x[], double z[], + const double a[], double p[], double q[], double r[], double* rnorm) +{ + int j, k; + int cgit, cgitmax; + double alpha, beta, suml; + static double d, sum, rho, rho0; + + cgitmax = 25; +#pragma omp single nowait + { + rho = 0.0; + sum = 0.0; + } +/* initialize the CG algorithm */ +#pragma omp for + for(j = 0; j < naa + 1; j++) + { + q[j] = 0.0; + z[j] = 0.0; + r[j] = x[j]; + p[j] = r[j]; + } + +/* + * -------------------------------------------------------------------- + * rho = r.r + * now, obtain the norm of r: First, sum squares of r elements locally... + * -------------------------------------------------------------------- + */ +#pragma omp for reduction(+ : rho) + for(j = 0; j < lastcol - firstcol + 1; j++) + { + rho += r[j] * r[j]; + } + + /* the conj grad iteration loop */ + for(cgit = 1; cgit <= cgitmax; cgit++) + { + /* + * --------------------------------------------------------------------- + * q = A.p + * the partition submatrix-vector multiply: use workspace w + * --------------------------------------------------------------------- + * + * note: this version of the multiply is actually (slightly: maybe %5) + * faster on the sp2 on 16 nodes than is the unrolled-by-2 version + * below. on the Cray t3d, the reverse is TRUE, i.e., the + * unrolled-by-two version is some 10% faster. + * the unrolled-by-8 version below is significantly faster + * on the Cray t3d - overall speed of code is 1.5 times faster. 
+ */ + +#pragma omp single nowait + { + d = 0.0; + /* + * -------------------------------------------------------------------- + * save a temporary of rho + * -------------------------------------------------------------------- + */ + rho0 = rho; + rho = 0.0; + } + +#pragma omp for nowait + for(j = 0; j < lastrow - firstrow + 1; j++) + { + suml = 0.0; + for(k = rowstr[j]; k < rowstr[j + 1]; k++) + { + suml += a[k] * p[colidx[k]]; + } + q[j] = suml; + } + + /* + * -------------------------------------------------------------------- + * obtain p.q + * -------------------------------------------------------------------- + */ + +#pragma omp for reduction(+ : d) + for(j = 0; j < lastcol - firstcol + 1; j++) + { + d += p[j] * q[j]; + } + + /* + * -------------------------------------------------------------------- + * obtain alpha = rho / (p.q) + * ------------------------------------------------------------------- + */ + alpha = rho0 / d; + + /* + * --------------------------------------------------------------------- + * obtain z = z + alpha*p + * and r = r - alpha*q + * --------------------------------------------------------------------- + */ + +#pragma omp for reduction(+ : rho) + for(j = 0; j < lastcol - firstcol + 1; j++) + { + z[j] += alpha * p[j]; + r[j] -= alpha * q[j]; + + /* + * --------------------------------------------------------------------- + * rho = r.r + * now, obtain the norm of r: first, sum squares of r elements locally... 
+ * --------------------------------------------------------------------- + */ + rho += r[j] * r[j]; + } + + /* + * --------------------------------------------------------------------- + * obtain beta + * --------------------------------------------------------------------- + */ + beta = rho / rho0; + +/* + * --------------------------------------------------------------------- + * p = r + beta*p + * --------------------------------------------------------------------- + */ +#pragma omp for + for(j = 0; j < lastcol - firstcol + 1; j++) + { + p[j] = r[j] + beta * p[j]; + } + } /* end of do cgit=1, cgitmax */ + +/* + * --------------------------------------------------------------------- + * compute residual norm explicitly: ||r|| = ||x - A.z|| + * first, form A.z + * the partition submatrix-vector multiply + * --------------------------------------------------------------------- + */ +#pragma omp for nowait + for(j = 0; j < lastrow - firstrow + 1; j++) + { + suml = 0.0; + for(k = rowstr[j]; k < rowstr[j + 1]; k++) + { + suml += a[k] * z[colidx[k]]; + } + r[j] = suml; + } + +/* + * --------------------------------------------------------------------- + * at this point, r contains A.z + * --------------------------------------------------------------------- + */ +#pragma omp for reduction(+ : sum) + for(j = 0; j < lastcol - firstcol + 1; j++) + { + suml = x[j] - r[j]; + sum += suml * suml; + } +#pragma omp single + *rnorm = sqrt(sum); +} + +/* + * --------------------------------------------------------------------- + * scale a double precision number x in (0,1) by a power of 2 and chop it + * --------------------------------------------------------------------- + */ +static int +icnvrt(double x, int ipwr2) +{ + return (int) (ipwr2 * x); +} + +/* + * --------------------------------------------------------------------- + * generate the test problem for benchmark 6 + * makea generates a sparse matrix with a + * prescribed sparsity distribution + * + * parameter type 
usage + * + * input + * + * n i number of cols/rows of matrix + * nz i nonzeros as declared array size + * rcond r*8 condition number + * shift r*8 main diagonal shift + * + * output + * + * a r*8 array for nonzeros + * colidx i col indices + * rowstr i row pointers + * + * workspace + * + * iv, arow, acol i + * aelt r*8 + * --------------------------------------------------------------------- + */ +static void +makea(int n, int nz, double a[], int colidx[], int rowstr[], int firstrow, int lastrow, + int firstcol, int lastcol, int arow[], int acol[][NONZER + 1], + double aelt[][NONZER + 1], int iv[]) +{ + (void) firstcol; + (void) lastcol; + + int iouter, ivelt, nzv, nn1; + int ivc[NONZER + 1]; + double vc[NONZER + 1]; + + /* + * -------------------------------------------------------------------- + * nonzer is approximately (int(sqrt(nnza /n))); + * -------------------------------------------------------------------- + * nn1 is the smallest power of two not less than n + * -------------------------------------------------------------------- + */ + nn1 = 1; + do + { + nn1 = 2 * nn1; + } while(nn1 < n); + + /* + * ------------------------------------------------------------------- + * generate nonzero positions and save for the use in sparse + * ------------------------------------------------------------------- + */ + for(iouter = 0; iouter < n; iouter++) + { + nzv = NONZER; + sprnvc(n, nzv, nn1, vc, ivc); + vecset(n, vc, ivc, &nzv, iouter + 1, 0.5); + arow[iouter] = nzv; + for(ivelt = 0; ivelt < nzv; ivelt++) + { + acol[iouter][ivelt] = ivc[ivelt] - 1; + aelt[iouter][ivelt] = vc[ivelt]; + } + } + + /* + * --------------------------------------------------------------------- + * ... 
make the sparse matrix from list of elements with duplicates + * (iv is used as workspace) + * --------------------------------------------------------------------- + */ + sparse(a, colidx, rowstr, n, nz, NONZER, arow, acol, aelt, firstrow, lastrow, iv, + RCOND, SHIFT); +} + +/* + * --------------------------------------------------------------------- + * rows range from firstrow to lastrow + * the rowstr pointers are defined for nrows = lastrow-firstrow+1 values + * --------------------------------------------------------------------- + */ +static void +sparse(double a[], int colidx[], int rowstr[], int n, int nz, int nozer, const int arow[], + int acol[][NONZER + 1], double aelt[][NONZER + 1], int firstrow, int lastrow, + int nzloc[], double rcond, double shift) +{ + (void) nozer; + int nrows; + + /* + * --------------------------------------------------- + * generate a sparse matrix from a list of + * [col, row, element] tri + * --------------------------------------------------- + */ + int i, j, j1, j2, nza, k, kk, nzrow, jcol; + double size, scale, ratio, va; + boolean goto_40; + + /* + * -------------------------------------------------------------------- + * how many rows of result + * -------------------------------------------------------------------- + */ + nrows = lastrow - firstrow + 1; + + /* + * -------------------------------------------------------------------- + * ...count the number of triples in each row + * -------------------------------------------------------------------- + */ + for(j = 0; j < nrows + 1; j++) + { + rowstr[j] = 0; + } + for(i = 0; i < n; i++) + { + for(nza = 0; nza < arow[i]; nza++) + { + j = acol[i][nza] + 1; + rowstr[j] = rowstr[j] + arow[i]; + } + } + rowstr[0] = 0; + for(j = 1; j < nrows + 1; j++) + { + rowstr[j] = rowstr[j] + rowstr[j - 1]; + } + nza = rowstr[nrows] - 1; + + /* + * --------------------------------------------------------------------- + * ... 
rowstr(j) now is the location of the first nonzero + * of row j of a + * --------------------------------------------------------------------- + */ + if(nza > nz) + { + printf("Space for matrix elements exceeded in sparse\n"); + printf("nza, nzmax = %d, %d\n", nza, nz); + exit(EXIT_FAILURE); + } + + /* + * --------------------------------------------------------------------- + * ... preload data pages + * --------------------------------------------------------------------- + */ + for(j = 0; j < nrows; j++) + { + for(k = rowstr[j]; k < rowstr[j + 1]; k++) + { + a[k] = 0.0; + colidx[k] = -1; + } + nzloc[j] = 0; + } + + /* + * --------------------------------------------------------------------- + * ... generate actual values by summing duplicates + * --------------------------------------------------------------------- + */ + size = 1.0; + ratio = pow(rcond, (1.0 / (double) (n))); + for(i = 0; i < n; i++) + { + for(nza = 0; nza < arow[i]; nza++) + { + j = acol[i][nza]; + + scale = size * aelt[i][nza]; + for(nzrow = 0; nzrow < arow[i]; nzrow++) + { + jcol = acol[i][nzrow]; + va = aelt[i][nzrow] * scale; + + /* + * -------------------------------------------------------------------- + * ... add the identity * rcond to the generated matrix to bound + * the smallest eigenvalue from below by rcond + * -------------------------------------------------------------------- + */ + if(jcol == j && j == i) + { + va = va + rcond - shift; + } + + goto_40 = FALSE; + for(k = rowstr[j]; k < rowstr[j + 1]; k++) + { + if(colidx[k] > jcol) + { + /* + * ---------------------------------------------------------------- + * ... 
insert colidx here orderly + * ---------------------------------------------------------------- + */ + for(kk = rowstr[j + 1] - 2; kk >= k; kk--) + { + if(colidx[kk] > -1) + { + a[kk + 1] = a[kk]; + colidx[kk + 1] = colidx[kk]; + } + } + colidx[k] = jcol; + a[k] = 0.0; + goto_40 = TRUE; + break; + } + else if(colidx[k] == -1) + { + colidx[k] = jcol; + goto_40 = TRUE; + break; + } + else if(colidx[k] == jcol) + { + /* + * -------------------------------------------------------------- + * ... mark the duplicated entry + * ------------------------------------------------------------- + */ + nzloc[j] = nzloc[j] + 1; + goto_40 = TRUE; + break; + } + } + if(goto_40 == FALSE) + { + printf("internal error in sparse: i=%d\n", i); + exit(EXIT_FAILURE); + } + a[k] = a[k] + va; + } + } + size = size * ratio; + } + + /* + * --------------------------------------------------------------------- + * ... remove empty entries and generate final results + * --------------------------------------------------------------------- + */ + for(j = 1; j < nrows; j++) + { + nzloc[j] = nzloc[j] + nzloc[j - 1]; + } + + for(j = 0; j < nrows; j++) + { + if(j > 0) + { + j1 = rowstr[j] - nzloc[j - 1]; + } + else + { + j1 = 0; + } + j2 = rowstr[j + 1] - nzloc[j]; + nza = rowstr[j]; + for(k = j1; k < j2; k++) + { + a[k] = a[nza]; + colidx[k] = colidx[nza]; + nza = nza + 1; + } + } + for(j = 1; j < nrows + 1; j++) + { + rowstr[j] = rowstr[j] - nzloc[j - 1]; + } + nza = rowstr[nrows] - 1; +} + +/* + * --------------------------------------------------------------------- + * generate a sparse n-vector (v, iv) + * having nzv nonzeros + * + * mark(i) is set to 1 if position i is nonzero. + * mark is all zero on entry and is reset to all zero before exit + * this corrects a performance bug found by John G. 
Lewis, caused by + * reinitialization of mark on every one of the n calls to sprnvc + * --------------------------------------------------------------------- + */ +static void +sprnvc(int n, int nz, int nn1, double v[], int iv[]) +{ + int nzv, ii, i; + double vecelt, vecloc; + + nzv = 0; + + while(nzv < nz) + { + vecelt = randlc(&tran, amult); + + /* + * -------------------------------------------------------------------- + * generate an integer between 1 and n in a portable manner + * -------------------------------------------------------------------- + */ + vecloc = randlc(&tran, amult); + i = icnvrt(vecloc, nn1) + 1; + if(i > n) + { + continue; + } + + /* + * -------------------------------------------------------------------- + * was this integer generated already? + * -------------------------------------------------------------------- + */ + boolean was_gen = FALSE; + for(ii = 0; ii < nzv; ii++) + { + if(iv[ii] == i) + { + was_gen = TRUE; + break; + } + } + if(was_gen != 0) + { + continue; + } + v[nzv] = vecelt; + iv[nzv] = i; + nzv = nzv + 1; + } +} + +/* + * -------------------------------------------------------------------- + * set ith element of sparse vector (v, iv) with + * nzv nonzeros to val + * -------------------------------------------------------------------- + */ +static void +vecset(int n, double v[], int iv[], int* nzv, int i, double val) +{ + (void) n; + int k; + boolean set; + + set = FALSE; + for(k = 0; k < *nzv; k++) + { + if(iv[k] == i) + { + v[k] = val; + set = TRUE; + } + } + if(set == FALSE) + { + v[*nzv] = val; + iv[*nzv] = i; + *nzv = *nzv + 1; + } +} diff --git a/examples/openmp/CG/npbparams.hpp b/examples/openmp/CG/npbparams.hpp new file mode 100644 index 0000000000..9f5e551aef --- /dev/null +++ b/examples/openmp/CG/npbparams.hpp @@ -0,0 +1,23 @@ +/* CLASS = B */ +/* + c This file is generated automatically by the setparams utility. + c It sets the number of processors and the class_npb of the NPB + c in this directory. 
Do not modify it by hand.
+ */
+#define NA 75000
+#define NONZER 13
+#define NITER 75
+#define SHIFT 60.0
+#define RCOND 1.0e-1
+#define CONVERTDOUBLE FALSE
+#define COMPILETIME "01 Mar 2022"
+#define NPBVERSION "4.1"
+#define LIBVERSION "201511"
+#define COMPILERVERSION "11.1.0"
+#define CS1 "g++ -std=c++14"
+#define CS2 "$(CC)"
+#define CS3 "-lm"
+#define CS4 "-I../common "
+#define CS5 "-O3 -fopenmp -mcmodel=medium"
+#define CS6 "-O3 -fopenmp -mcmodel=medium"
+#define CS7 "randdp"
diff --git a/examples/openmp/CMakeLists.txt b/examples/openmp/CMakeLists.txt
new file mode 100644
index 0000000000..4195ccb25e
--- /dev/null
+++ b/examples/openmp/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Emit every artifact of this directory directly into the top-level build tree.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+# Sources under common/ are compiled once into an object library.
+file(GLOB common_source ${CMAKE_CURRENT_SOURCE_DIR}/common/*.cpp)
+add_library(openmp-common OBJECT ${common_source})
+target_include_directories(openmp-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/common)
+
+# Each benchmark executable is its own source file plus the shared object files.
+add_executable(openmp-cg ${CMAKE_CURRENT_SOURCE_DIR}/CG/cg.cpp
+                         $<TARGET_OBJECTS:openmp-common>)
+add_executable(openmp-lu ${CMAKE_CURRENT_SOURCE_DIR}/LU/lu.cpp
+                         $<TARGET_OBJECTS:openmp-common>)
+
+# When clang++ is found, build with -fopenmp=libomp and pass the targets to
+# omnitrace_custom_compilation; otherwise use the OpenMP found by find_package.
+find_program(CLANGXX_EXECUTABLE NAMES clang++)
+if(CLANGXX_EXECUTABLE)
+    target_compile_options(openmp-common PUBLIC -W -Wall -fopenmp=libomp)
+    target_compile_options(openmp-cg PRIVATE -W -Wall -fopenmp=libomp)
+    target_link_libraries(openmp-cg PRIVATE omp)
+    target_compile_options(openmp-lu PRIVATE -W -Wall -fopenmp=libomp)
+    target_link_libraries(openmp-lu PRIVATE omp)
+    omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-common)
+    omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-cg)
+    omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-lu)
+else()
+    find_package(OpenMP REQUIRED)
+    target_link_libraries(openmp-common PUBLIC OpenMP::OpenMP_CXX)
+    # The executables only consume openmp-common's object files, so they do not
+    # inherit its usage requirements and must link OpenMP themselves.
+    target_link_libraries(openmp-cg PRIVATE OpenMP::OpenMP_CXX)
+    target_link_libraries(openmp-lu PRIVATE OpenMP::OpenMP_CXX)
+endif()
diff --git a/examples/openmp/LU/lu.cpp
b/examples/openmp/LU/lu.cpp new file mode 100644 index 0000000000..cb0eb2a8fc --- /dev/null +++ b/examples/openmp/LU/lu.cpp @@ -0,0 +1,3563 @@ +/* +MIT License + +Copyright (c) 2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +Authors of the Fortran code: + S. Weeratunga + V. Venkatakrishnan + E. Barszcz + M. Yarrow + H. 
Jin + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff + +------------------------------------------------------------------------------ + +The OpenMP version is a parallel implementation of the serial C++ version +OpenMP version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-OMP + +Authors of the OpenMP code: + Júnior Löff + +*/ + +#include "../common/npb-CPP.hpp" +#include "npbparams.hpp" +#include "omp.h" + +/* + * --------------------------------------------------------------------- + * driver for the performance evaluation of the solver for + * five coupled parabolic/elliptic partial differential equations + * --------------------------------------------------------------------- + * parameters which can be overridden in runtime config file + * isiz1,isiz2,isiz3 give the maximum size + * ipr = 1 to print out verbose information + * omega = 2.0 is correct for all classes + * tolrsd is tolerance levels for steady state residuals + * --------------------------------------------------------------------- + * field variables and residuals + * to improve cache performance, second two dimensions padded by 1 + * for even number sizes only. 
+ * note: corresponding array (called "v") in routines blts, buts, + * and l2norm are similarly padded + * --------------------------------------------------------------------- + */ +#define IPR_DEFAULT 1 +#define OMEGA_DEFAULT 1.2 +#define TOLRSD1_DEF 1.0e-08 +#define TOLRSD2_DEF 1.0e-08 +#define TOLRSD3_DEF 1.0e-08 +#define TOLRSD4_DEF 1.0e-08 +#define TOLRSD5_DEF 1.0e-08 +#define C1 1.40e+00 +#define C2 0.40e+00 +#define C3 1.00e-01 +#define C4 1.00e+00 +#define C5 1.40e+00 +#define T_TOTAL 1 +#define T_RHSX 2 +#define T_RHSY 3 +#define T_RHSZ 4 +#define T_RHS 5 +#define T_JACLD 6 +#define T_BLTS 7 +#define T_JACU 8 +#define T_BUTS 9 +#define T_ADD 10 +#define T_L2NORM 11 +#define T_LAST 11 + +/* global variables */ +#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION) +static double u[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5]; +static double rsd[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5]; +static double frct[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5]; +static double flux[ISIZ1][5]; +static double qs[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1]; +static double rho_i[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1]; +static double a[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5]; +static double b[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5]; +static double c[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5]; +static double d[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5]; +static double ce[13][5]; +#else +static double (*u)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5] = + (double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5]) + malloc(sizeof(double) * + ((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1) * (5))); +static double (*rsd)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5] = + (double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5]) + malloc(sizeof(double) * + ((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1) * (5))); +static double (*frct)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5] = + (double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5]) + malloc(sizeof(double) * + ((ISIZ3) * (ISIZ2 / 2 
* 2 + 1) * (ISIZ1 / 2 * 2 + 1) * (5))); +static double (*flux)[5] = (double (*)[5]) malloc(sizeof(double) * ((ISIZ1) * (5))); +static double (*qs)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1] = + (double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1]) + malloc(sizeof(double) * ((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1))); +static double (*rho_i)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1] = + (double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1]) + malloc(sizeof(double) * ((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1))); +static double (*a)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5]) + malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5))); +static double (*b)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5]) + malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5))); +static double (*c)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5]) + malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5))); +static double (*d)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5]) + malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5))); +static double (*ce)[5] = (double (*)[5]) malloc(sizeof(double) * ((13) * (5))); +#endif +/* grid */ +static double dxi, deta, dzeta; +static double tx1, tx2, tx3; +static double ty1, ty2, ty3; +static double tz1, tz2, tz3; +static int nx, ny, nz; +static int nx0, ny0, nz0; +static int ist, iend; +static int jst, jend; +static int ii1, ii2; +static int ji1, ji2; +static int ki1, ki2; +/* dissipation */ +static double dx1, dx2, dx3, dx4, dx5; +static double dy1, dy2, dy3, dy4, dy5; +static double dz1, dz2, dz3, dz4, dz5; +static double dssp; +/* output control parameters */ +static int ipr, inorm; +/* newton-raphson iteration control parameters */ +static double dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc; +static int itmax; +/* timer */ +static double maxtime; +static boolean timeron; + +/* function prototypes */ +void +blts(int nx, int 
ny, int nz, int k, double omega, + double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], + double ldz[][ISIZ1 / 2 * 2 + 1][5][5], double ldy[][ISIZ1 / 2 * 2 + 1][5][5], + double ldx[][ISIZ1 / 2 * 2 + 1][5][5], double d[][ISIZ1 / 2 * 2 + 1][5][5], int ist, + int iend, int jst, int jend, int nx0, int ny0); +void +buts(int nx, int ny, int nz, int k, double omega, + double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], void* pointer_tv, + double d[][ISIZ1 / 2 * 2 + 1][5][5], double udx[][ISIZ1 / 2 * 2 + 1][5][5], + double udy[][ISIZ1 / 2 * 2 + 1][5][5], double udz[][ISIZ1 / 2 * 2 + 1][5][5], + int ist, int iend, int jst, int jend, int nx0, int ny0); +void +domain(); +void +erhs(); +void +error(); +void +exact(int i, int j, int k, double u000ijk[]); +void +jacld(int k); +void +jacu(int k); +void +l2norm(int nx0, int ny0, int nz0, int ist, int iend, int jst, int jend, + double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], double sum[5]); +void +pintgr(); +void +read_input(); +void +rhs(); +void +setbv(); +void +setcoeff(); +void +setiv(); +void +ssor(int niter); +void +verify(double xcr[], double xce[], double xci, char* class_npb, boolean* verified); + +static boolean flag[ISIZ1 / 2 * 2 + 1]; +static boolean flag2[ISIZ1 / 2 * 2 + 1]; + +/* lu */ +int +main(int, char*[]) +{ +#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION) + printf( + " DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION mode on\n"); +#endif + char class_npb; + boolean verified; + double mflops; + double t, tmax, trecs[T_LAST + 1]; + int i; + char* t_names[T_LAST + 1]; + /* + * --------------------------------------------------------------------- + * setup info for timers + * --------------------------------------------------------------------- + */ + FILE* fp; + if((fp = fopen("timer.flag", "r")) != NULL) + { + timeron = TRUE; + t_names[T_TOTAL] = (char*) "total"; + t_names[T_RHSX] = (char*) "rhsx"; + t_names[T_RHSY] = (char*) "rhsy"; + t_names[T_RHSZ] = 
(char*) "rhsz"; + t_names[T_RHS] = (char*) "rhs"; + t_names[T_JACLD] = (char*) "jacld"; + t_names[T_BLTS] = (char*) "blts"; + t_names[T_JACU] = (char*) "jacu"; + t_names[T_BUTS] = (char*) "buts"; + t_names[T_ADD] = (char*) "add"; + t_names[T_L2NORM] = (char*) "l2norm"; + fclose(fp); + } + else + { + timeron = FALSE; + } + /* + * --------------------------------------------------------------------- + * read input data + * --------------------------------------------------------------------- + */ + read_input(); + /* + * --------------------------------------------------------------------- + * set up domain sizes + * --------------------------------------------------------------------- + */ + domain(); + /* + * --------------------------------------------------------------------- + * set up coefficients + * --------------------------------------------------------------------- + */ + setcoeff(); + +#pragma omp parallel + { + /* + * --------------------------------------------------------------------- + * set the boundary values for dependent variables + * --------------------------------------------------------------------- + */ + setbv(); + /* + * --------------------------------------------------------------------- + * set the initial values for dependent variables + * --------------------------------------------------------------------- + */ + setiv(); + /* + * --------------------------------------------------------------------- + * compute the forcing term based on prescribed exact solution + * --------------------------------------------------------------------- + */ + erhs(); + } /* end parallel */ + + /* + * --------------------------------------------------------------------- + * perform one SSOR iteration to touch all pages + * --------------------------------------------------------------------- + */ + ssor(1); +#pragma omp parallel + { + /* + * --------------------------------------------------------------------- + * reset the boundary and initial values + 
* --------------------------------------------------------------------- + */ + setbv(); + setiv(); + } + + /* + * --------------------------------------------------------------------- + * perform the SSOR iterations + * --------------------------------------------------------------------- + */ + ssor(itmax); + /* + * --------------------------------------------------------------------- + * compute the solution error + * --------------------------------------------------------------------- + */ + error(); + /* + * --------------------------------------------------------------------- + * compute the surface integral + * --------------------------------------------------------------------- + */ + pintgr(); + /* + * --------------------------------------------------------------------- + * verification test + * --------------------------------------------------------------------- + */ + verify(rsdnm, errnm, frc, &class_npb, &verified); + mflops = (double) itmax * + (1984.77 * (double) nx0 * (double) ny0 * (double) nz0 - + 10923.3 * pow(((double) (nx0 + ny0 + nz0) / 3.0), 2.0) + + 27770.9 * (double) (nx0 + ny0 + nz0) / 3.0 - 144010.0) / + (maxtime * 1000000.0); + setenv("OMP_NUM_THREADS", "1", 0); + c_print_results((char*) "LU", class_npb, nx0, ny0, nz0, itmax, maxtime, mflops, + (char*) " floating point", verified, (char*) NPBVERSION, + (char*) COMPILETIME, (char*) COMPILERVERSION, (char*) LIBVERSION, + std::getenv("OMP_NUM_THREADS"), (char*) CS1, (char*) CS2, (char*) CS3, + (char*) CS4, (char*) CS5, (char*) CS6, (char*) "(none)"); + /* + * --------------------------------------------------------------------- + * more timers + * --------------------------------------------------------------------- + */ + if(timeron) + { + for(i = 1; i <= T_LAST; i++) + { + trecs[i] = timer_read(i); + } + tmax = maxtime; + if(tmax == 0.0) + { + tmax = 1.0; + } + printf(" SECTION Time (secs)\n"); + for(i = 1; i <= T_LAST; i++) + { + printf(" %-8s:%9.3f (%6.2f%%)\n", t_names[i], trecs[i], 
+ trecs[i] * 100. / tmax); + if(i == T_RHS) + { + t = trecs[T_RHSX] + trecs[T_RHSY] + trecs[T_RHSZ]; + printf(" --> %8s:%9.3f (%6.2f%%)\n", "sub-rhs", t, t * 100. / tmax); + t = trecs[i] - t; + printf(" --> %8s:%9.3f (%6.2f%%)\n", "rest-rhs", t, t * 100. / tmax); + } + } + } + return 0; +} + +/* + * --------------------------------------------------------------------- + * compute the regular-sparse, block lower triangular solution: + * v <-- ( L-inv ) * v + * --------------------------------------------------------------------- + * to improve cache performance, second two dimensions padded by 1 + * for even number sizes only. only needed in v. + * --------------------------------------------------------------------- + */ +void +blts(int /*nx*/, int /*ny*/, int /*nz*/, int k, double omega, + double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], + double ldz[][ISIZ1 / 2 * 2 + 1][5][5], double ldy[][ISIZ1 / 2 * 2 + 1][5][5], + double ldx[][ISIZ1 / 2 * 2 + 1][5][5], double d[][ISIZ1 / 2 * 2 + 1][5][5], int ist, + int iend, int jst, int jend, int /*nx0*/, int /*ny0*/) +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, m; + double tmp, tmp1; + double tmat[5][5], tv[5]; + +#pragma omp for nowait schedule(static) + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + for(m = 0; m < 5; m++) + { + v[k][j][i][m] = + v[k][j][i][m] - omega * (ldz[j][i][0][m] * v[k - 1][j][i][0] + + ldz[j][i][1][m] * v[k - 1][j][i][1] + + ldz[j][i][2][m] * v[k - 1][j][i][2] + + ldz[j][i][3][m] * v[k - 1][j][i][3] + + ldz[j][i][4][m] * v[k - 1][j][i][4]); + } + } + } + +#pragma omp for nowait schedule(static) + for(j = jst; j < jend; j++) + { + if(j != jst) + { + while(flag[j - 1] == 0) + { +#pragma omp flush + ; + } + } + if(j != jend - 1) + { + while(flag[j] == 1) + { +#pragma omp flush + ; + } + } + + for(i = ist; i < iend; i++) + { + for(m 
= 0; m < 5; m++) + { + tv[m] = v[k][j][i][m] - omega * (ldy[j][i][0][m] * v[k][j - 1][i][0] + + ldx[j][i][0][m] * v[k][j][i - 1][0] + + ldy[j][i][1][m] * v[k][j - 1][i][1] + + ldx[j][i][1][m] * v[k][j][i - 1][1] + + ldy[j][i][2][m] * v[k][j - 1][i][2] + + ldx[j][i][2][m] * v[k][j][i - 1][2] + + ldy[j][i][3][m] * v[k][j - 1][i][3] + + ldx[j][i][3][m] * v[k][j][i - 1][3] + + ldy[j][i][4][m] * v[k][j - 1][i][4] + + ldx[j][i][4][m] * v[k][j][i - 1][4]); + } + /* + * --------------------------------------------------------------------- + * diagonal block inversion + * + * forward elimination + * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + tmat[0][m] = d[j][i][0][m]; + tmat[1][m] = d[j][i][1][m]; + tmat[2][m] = d[j][i][2][m]; + tmat[3][m] = d[j][i][3][m]; + tmat[4][m] = d[j][i][4][m]; + } + /* */ + tmp1 = 1.0 / tmat[0][0]; + tmp = tmp1 * tmat[0][1]; + tmat[1][1] = tmat[1][1] - tmp * tmat[1][0]; + tmat[2][1] = tmat[2][1] - tmp * tmat[2][0]; + tmat[3][1] = tmat[3][1] - tmp * tmat[3][0]; + tmat[4][1] = tmat[4][1] - tmp * tmat[4][0]; + tv[1] = tv[1] - tv[0] * tmp; + /* */ + tmp = tmp1 * tmat[0][2]; + tmat[1][2] = tmat[1][2] - tmp * tmat[1][0]; + tmat[2][2] = tmat[2][2] - tmp * tmat[2][0]; + tmat[3][2] = tmat[3][2] - tmp * tmat[3][0]; + tmat[4][2] = tmat[4][2] - tmp * tmat[4][0]; + tv[2] = tv[2] - tv[0] * tmp; + /* */ + tmp = tmp1 * tmat[0][3]; + tmat[1][3] = tmat[1][3] - tmp * tmat[1][0]; + tmat[2][3] = tmat[2][3] - tmp * tmat[2][0]; + tmat[3][3] = tmat[3][3] - tmp * tmat[3][0]; + tmat[4][3] = tmat[4][3] - tmp * tmat[4][0]; + tv[3] = tv[3] - tv[0] * tmp; + /* */ + tmp = tmp1 * tmat[0][4]; + tmat[1][4] = tmat[1][4] - tmp * tmat[1][0]; + tmat[2][4] = tmat[2][4] - tmp * tmat[2][0]; + tmat[3][4] = tmat[3][4] - tmp * tmat[3][0]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][0]; + tv[4] = tv[4] - tv[0] * tmp; + /* */ + tmp1 = 1.0 / tmat[1][1]; + tmp = tmp1 * tmat[1][2]; + tmat[2][2] = tmat[2][2] - tmp * tmat[2][1]; + 
tmat[3][2] = tmat[3][2] - tmp * tmat[3][1]; + tmat[4][2] = tmat[4][2] - tmp * tmat[4][1]; + tv[2] = tv[2] - tv[1] * tmp; + /* */ + tmp = tmp1 * tmat[1][3]; + tmat[2][3] = tmat[2][3] - tmp * tmat[2][1]; + tmat[3][3] = tmat[3][3] - tmp * tmat[3][1]; + tmat[4][3] = tmat[4][3] - tmp * tmat[4][1]; + tv[3] = tv[3] - tv[1] * tmp; + /* */ + tmp = tmp1 * tmat[1][4]; + tmat[2][4] = tmat[2][4] - tmp * tmat[2][1]; + tmat[3][4] = tmat[3][4] - tmp * tmat[3][1]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][1]; + tv[4] = tv[4] - tv[1] * tmp; + /* */ + tmp1 = 1.0 / tmat[2][2]; + tmp = tmp1 * tmat[2][3]; + tmat[3][3] = tmat[3][3] - tmp * tmat[3][2]; + tmat[4][3] = tmat[4][3] - tmp * tmat[4][2]; + tv[3] = tv[3] - tv[2] * tmp; + /* */ + tmp = tmp1 * tmat[2][4]; + tmat[3][4] = tmat[3][4] - tmp * tmat[3][2]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][2]; + tv[4] = tv[4] - tv[2] * tmp; + /* */ + tmp1 = 1.0 / tmat[3][3]; + tmp = tmp1 * tmat[3][4]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][3]; + tv[4] = tv[4] - tv[3] * tmp; + /* + * --------------------------------------------------------------------- + * back substitution + * --------------------------------------------------------------------- + */ + v[k][j][i][4] = tv[4] / tmat[4][4]; + tv[3] = tv[3] - tmat[4][3] * v[k][j][i][4]; + v[k][j][i][3] = tv[3] / tmat[3][3]; + tv[2] = tv[2] - tmat[3][2] * v[k][j][i][3] - tmat[4][2] * v[k][j][i][4]; + v[k][j][i][2] = tv[2] / tmat[2][2]; + tv[1] = tv[1] - tmat[2][1] * v[k][j][i][2] - tmat[3][1] * v[k][j][i][3] - + tmat[4][1] * v[k][j][i][4]; + v[k][j][i][1] = tv[1] / tmat[1][1]; + tv[0] = tv[0] - tmat[1][0] * v[k][j][i][1] - tmat[2][0] * v[k][j][i][2] - + tmat[3][0] * v[k][j][i][3] - tmat[4][0] * v[k][j][i][4]; + v[k][j][i][0] = tv[0] / tmat[0][0]; + } + + if(j != jend - 1) flag[j] = 1; + if(j != jst) flag[j - 1] = 0; + } +} + +/* + * --------------------------------------------------------------------- + * compute the regular-sparse, block upper triangular solution: + * v <-- ( U-inv ) * v + * 
--------------------------------------------------------------------- + * to improve cache performance, second two dimensions padded by 1 + * for even number sizes only. only needed in v. + * --------------------------------------------------------------------- + */ +void +buts(int /*nx*/, int /*ny*/, int /*nz*/, int k, double omega, + double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], void* pointer_tv, + double d[][ISIZ1 / 2 * 2 + 1][5][5], double udx[][ISIZ1 / 2 * 2 + 1][5][5], + double udy[][ISIZ1 / 2 * 2 + 1][5][5], double udz[][ISIZ1 / 2 * 2 + 1][5][5], + int ist, int iend, int jst, int jend, int /*nx0*/, int /*ny0*/) +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + double(*tv)[ISIZ1 / 2 * 2 + 1][5] = (double(*)[ISIZ1 / 2 * 2 + 1][5]) pointer_tv; + int i, j, m; + double tmp, tmp1; + double tmat[5][5]; + +#pragma omp for nowait schedule(static) + for(j = jend - 1; j >= jst; j--) + { + for(i = iend - 1; i >= ist; i--) + { + for(m = 0; m < 5; m++) + { + tv[j][i][m] = omega * (udz[j][i][0][m] * v[k + 1][j][i][0] + + udz[j][i][1][m] * v[k + 1][j][i][1] + + udz[j][i][2][m] * v[k + 1][j][i][2] + + udz[j][i][3][m] * v[k + 1][j][i][3] + + udz[j][i][4][m] * v[k + 1][j][i][4]); + } + } + } + +#pragma omp for nowait schedule(static) + for(j = jend - 1; j >= jst; j--) + { + if(j != jend - 1) + { + while(flag2[j + 1] == 0) + { +#pragma omp flush + ; + } + } + if(j != jst) + { + while(flag2[j] == 1) + { +#pragma omp flush + ; + } + } + + for(i = iend - 1; i >= ist; i--) + { + for(m = 0; m < 5; m++) + { + tv[j][i][m] = tv[j][i][m] + omega * (udy[j][i][0][m] * v[k][j + 1][i][0] + + udx[j][i][0][m] * v[k][j][i + 1][0] + + udy[j][i][1][m] * v[k][j + 1][i][1] + + udx[j][i][1][m] * v[k][j][i + 1][1] + + udy[j][i][2][m] * v[k][j + 1][i][2] + + udx[j][i][2][m] * v[k][j][i + 1][2] + + udy[j][i][3][m] * v[k][j + 1][i][3] + + udx[j][i][3][m] * 
v[k][j][i + 1][3] + + udy[j][i][4][m] * v[k][j + 1][i][4] + + udx[j][i][4][m] * v[k][j][i + 1][4]); + } + /* + * --------------------------------------------------------------------- + * diagonal block inversion + * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + tmat[0][m] = d[j][i][0][m]; + tmat[1][m] = d[j][i][1][m]; + tmat[2][m] = d[j][i][2][m]; + tmat[3][m] = d[j][i][3][m]; + tmat[4][m] = d[j][i][4][m]; + } + /* */ + tmp1 = 1.0 / tmat[0][0]; + tmp = tmp1 * tmat[0][1]; + tmat[1][1] = tmat[1][1] - tmp * tmat[1][0]; + tmat[2][1] = tmat[2][1] - tmp * tmat[2][0]; + tmat[3][1] = tmat[3][1] - tmp * tmat[3][0]; + tmat[4][1] = tmat[4][1] - tmp * tmat[4][0]; + tv[j][i][1] = tv[j][i][1] - tv[j][i][0] * tmp; + /* */ + tmp = tmp1 * tmat[0][2]; + tmat[1][2] = tmat[1][2] - tmp * tmat[1][0]; + tmat[2][2] = tmat[2][2] - tmp * tmat[2][0]; + tmat[3][2] = tmat[3][2] - tmp * tmat[3][0]; + tmat[4][2] = tmat[4][2] - tmp * tmat[4][0]; + tv[j][i][2] = tv[j][i][2] - tv[j][i][0] * tmp; + /* */ + tmp = tmp1 * tmat[0][3]; + tmat[1][3] = tmat[1][3] - tmp * tmat[1][0]; + tmat[2][3] = tmat[2][3] - tmp * tmat[2][0]; + tmat[3][3] = tmat[3][3] - tmp * tmat[3][0]; + tmat[4][3] = tmat[4][3] - tmp * tmat[4][0]; + tv[j][i][3] = tv[j][i][3] - tv[j][i][0] * tmp; + /* */ + tmp = tmp1 * tmat[0][4]; + tmat[1][4] = tmat[1][4] - tmp * tmat[1][0]; + tmat[2][4] = tmat[2][4] - tmp * tmat[2][0]; + tmat[3][4] = tmat[3][4] - tmp * tmat[3][0]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][0]; + tv[j][i][4] = tv[j][i][4] - tv[j][i][0] * tmp; + /* */ + tmp1 = 1.0 / tmat[1][1]; + tmp = tmp1 * tmat[1][2]; + tmat[2][2] = tmat[2][2] - tmp * tmat[2][1]; + tmat[3][2] = tmat[3][2] - tmp * tmat[3][1]; + tmat[4][2] = tmat[4][2] - tmp * tmat[4][1]; + tv[j][i][2] = tv[j][i][2] - tv[j][i][1] * tmp; + /* */ + tmp = tmp1 * tmat[1][3]; + tmat[2][3] = tmat[2][3] - tmp * tmat[2][1]; + tmat[3][3] = tmat[3][3] - tmp * tmat[3][1]; + tmat[4][3] = tmat[4][3] - tmp * tmat[4][1]; + 
tv[j][i][3] = tv[j][i][3] - tv[j][i][1] * tmp; + /* */ + tmp = tmp1 * tmat[1][4]; + tmat[2][4] = tmat[2][4] - tmp * tmat[2][1]; + tmat[3][4] = tmat[3][4] - tmp * tmat[3][1]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][1]; + tv[j][i][4] = tv[j][i][4] - tv[j][i][1] * tmp; + /* */ + tmp1 = 1.0 / tmat[2][2]; + tmp = tmp1 * tmat[2][3]; + tmat[3][3] = tmat[3][3] - tmp * tmat[3][2]; + tmat[4][3] = tmat[4][3] - tmp * tmat[4][2]; + tv[j][i][3] = tv[j][i][3] - tv[j][i][2] * tmp; + /* */ + tmp = tmp1 * tmat[2][4]; + tmat[3][4] = tmat[3][4] - tmp * tmat[3][2]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][2]; + tv[j][i][4] = tv[j][i][4] - tv[j][i][2] * tmp; + /* */ + tmp1 = 1.0 / tmat[3][3]; + tmp = tmp1 * tmat[3][4]; + tmat[4][4] = tmat[4][4] - tmp * tmat[4][3]; + tv[j][i][4] = tv[j][i][4] - tv[j][i][3] * tmp; + /* + * --------------------------------------------------------------------- + * back substitution + * --------------------------------------------------------------------- + */ + tv[j][i][4] = tv[j][i][4] / tmat[4][4]; + tv[j][i][3] = tv[j][i][3] - tmat[4][3] * tv[j][i][4]; + tv[j][i][3] = tv[j][i][3] / tmat[3][3]; + tv[j][i][2] = + tv[j][i][2] - tmat[3][2] * tv[j][i][3] - tmat[4][2] * tv[j][i][4]; + tv[j][i][2] = tv[j][i][2] / tmat[2][2]; + tv[j][i][1] = tv[j][i][1] - tmat[2][1] * tv[j][i][2] - + tmat[3][1] * tv[j][i][3] - tmat[4][1] * tv[j][i][4]; + tv[j][i][1] = tv[j][i][1] / tmat[1][1]; + tv[j][i][0] = tv[j][i][0] - tmat[1][0] * tv[j][i][1] - + tmat[2][0] * tv[j][i][2] - tmat[3][0] * tv[j][i][3] - + tmat[4][0] * tv[j][i][4]; + tv[j][i][0] = tv[j][i][0] / tmat[0][0]; + v[k][j][i][0] = v[k][j][i][0] - tv[j][i][0]; + v[k][j][i][1] = v[k][j][i][1] - tv[j][i][1]; + v[k][j][i][2] = v[k][j][i][2] - tv[j][i][2]; + v[k][j][i][3] = v[k][j][i][3] - tv[j][i][3]; + v[k][j][i][4] = v[k][j][i][4] - tv[j][i][4]; + } + + if(j != jend - 1) flag2[j + 1] = 0; + if(j != jst) flag2[j] = 1; + } +} + +void +domain() +{ + /* + * 
--------------------------------------------------------------------- + * sub-domain size: copy nx0, ny0, nz0 into nx, ny, nz + * --------------------------------------------------------------------- + */ + nx = nx0; + ny = ny0; + nz = nz0; + /* + * --------------------------------------------------------------------- + * check the sub-domain size: exit if any extent is < 4 or > ISIZ1/ISIZ2/ISIZ3 + * --------------------------------------------------------------------- + */ + if((nx < 4) || (ny < 4) || (nz < 4)) + { + printf(" SUBDOMAIN SIZE IS TOO SMALL - \n" + " ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS\n" + " SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL\n" + " TO 4 THEY ARE CURRENTLY%3d%3d%3d\n", + nx, ny, nz); + exit(EXIT_FAILURE); + } + if((nx > ISIZ1) || (ny > ISIZ2) || (nz > ISIZ3)) + { + printf(" SUBDOMAIN SIZE IS TOO LARGE - \n" + " ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS\n" + " SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO \n" + " ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY. THEY ARE\n" + " CURRENTLY%4d%4d%4d\n", + nx, ny, nz); + exit(EXIT_FAILURE); + } + /* + * --------------------------------------------------------------------- + * set up the start and end in i and j extents for all processors + * --------------------------------------------------------------------- + */ + ist = 1; + iend = nx - 1; + jst = 1; + jend = ny - 1; + ii1 = 1; + ii2 = nx0 - 1; + ji1 = 1; + ji2 = ny0 - 2; + ki1 = 2; + ki2 = nz0 - 1; +} + +/* + * --------------------------------------------------------------------- + * compute the right hand side based on exact solution + * --------------------------------------------------------------------- + */ +void +erhs() +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, k, m; + double xi, eta, zeta; + double q; + double u21, u31, u41; + double tmp; + double u21i, u31i, u41i, u51i; + double u21j, u31j, u41j, u51j; + double u21k, u31k, u41k, u51k; + double u21im1, u31im1, u41im1,
u51im1; + double u21jm1, u31jm1, u41jm1, u51jm1; + double u21km1, u31km1, u41km1, u51km1; + double flux[ISIZ1][5]; + +#pragma omp for + for(k = 0; k < nz; k++) + { + for(j = 0; j < ny; j++) + { + for(i = 0; i < nx; i++) + { + for(m = 0; m < 5; m++) + { + frct[k][j][i][m] = 0.0; + } + } + } + } + +#pragma omp for + for(k = 0; k < nz; k++) + { + zeta = ((double) k) / (nz - 1); + for(j = 0; j < ny; j++) + { + eta = ((double) j) / (ny0 - 1); + for(i = 0; i < nx; i++) + { + xi = ((double) i) / (nx0 - 1); + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = + ce[0][m] + + (ce[1][m] + (ce[4][m] + (ce[7][m] + ce[10][m] * xi) * xi) * xi) * + xi + + (ce[2][m] + + (ce[5][m] + (ce[8][m] + ce[11][m] * eta) * eta) * eta) * + eta + + (ce[3][m] + + (ce[6][m] + (ce[9][m] + ce[12][m] * zeta) * zeta) * zeta) * + zeta; + } + } + } + } +/* + * --------------------------------------------------------------------- + * xi-direction flux differences + * --------------------------------------------------------------------- + */ +#pragma omp for + for(k = 1; k < nz - 1; k++) + { + for(j = jst; j < jend; j++) + { + for(i = 0; i < nx; i++) + { + flux[i][0] = rsd[k][j][i][1]; + u21 = rsd[k][j][i][1] / rsd[k][j][i][0]; + q = 0.50 * + (rsd[k][j][i][1] * rsd[k][j][i][1] + + rsd[k][j][i][2] * rsd[k][j][i][2] + + rsd[k][j][i][3] * rsd[k][j][i][3]) / + rsd[k][j][i][0]; + flux[i][1] = rsd[k][j][i][1] * u21 + C2 * (rsd[k][j][i][4] - q); + flux[i][2] = rsd[k][j][i][2] * u21; + flux[i][3] = rsd[k][j][i][3] * u21; + flux[i][4] = (C1 * rsd[k][j][i][4] - C2 * q) * u21; + } + for(i = ist; i < iend; i++) + { + for(m = 0; m < 5; m++) + { + frct[k][j][i][m] = + frct[k][j][i][m] - tx2 * (flux[i + 1][m] - flux[i - 1][m]); + } + } + for(i = ist; i < nx; i++) + { + tmp = 1.0 / rsd[k][j][i][0]; + u21i = tmp * rsd[k][j][i][1]; + u31i = tmp * rsd[k][j][i][2]; + u41i = tmp * rsd[k][j][i][3]; + u51i = tmp * rsd[k][j][i][4]; + tmp = 1.0 / rsd[k][j][i - 1][0]; + u21im1 = tmp * rsd[k][j][i - 1][1]; + u31im1 = tmp * rsd[k][j][i 
- 1][2]; + u41im1 = tmp * rsd[k][j][i - 1][3]; + u51im1 = tmp * rsd[k][j][i - 1][4]; + flux[i][1] = (4.0 / 3.0) * tx3 * (u21i - u21im1); + flux[i][2] = tx3 * (u31i - u31im1); + flux[i][3] = tx3 * (u41i - u41im1); + flux[i][4] = 0.50 * (1.0 - C1 * C5) * tx3 * + ((u21i * u21i + u31i * u31i + u41i * u41i) - + (u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1)) + + (1.0 / 6.0) * tx3 * (u21i * u21i - u21im1 * u21im1) + + C1 * C5 * tx3 * (u51i - u51im1); + } + for(i = ist; i < iend; i++) + { + frct[k][j][i][0] = + frct[k][j][i][0] + dx1 * tx1 * + (rsd[k][j][i - 1][0] - 2.0 * rsd[k][j][i][0] + + rsd[k][j][i + 1][0]); + frct[k][j][i][1] = frct[k][j][i][1] + + tx3 * C3 * C4 * (flux[i + 1][1] - flux[i][1]) + + dx2 * tx1 * + (rsd[k][j][i - 1][1] - 2.0 * rsd[k][j][i][1] + + rsd[k][j][i + 1][1]); + frct[k][j][i][2] = frct[k][j][i][2] + + tx3 * C3 * C4 * (flux[i + 1][2] - flux[i][2]) + + dx3 * tx1 * + (rsd[k][j][i - 1][2] - 2.0 * rsd[k][j][i][2] + + rsd[k][j][i + 1][2]); + frct[k][j][i][3] = frct[k][j][i][3] + + tx3 * C3 * C4 * (flux[i + 1][3] - flux[i][3]) + + dx4 * tx1 * + (rsd[k][j][i - 1][3] - 2.0 * rsd[k][j][i][3] + + rsd[k][j][i + 1][3]); + frct[k][j][i][4] = frct[k][j][i][4] + + tx3 * C3 * C4 * (flux[i + 1][4] - flux[i][4]) + + dx5 * tx1 * + (rsd[k][j][i - 1][4] - 2.0 * rsd[k][j][i][4] + + rsd[k][j][i + 1][4]); + } + /* + * --------------------------------------------------------------------- + * fourth-order dissipation + * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + frct[k][j][1][m] = + frct[k][j][1][m] - dssp * (+5.0 * rsd[k][j][1][m] - + 4.0 * rsd[k][j][2][m] + rsd[k][j][3][m]); + frct[k][j][2][m] = + frct[k][j][2][m] - + dssp * (-4.0 * rsd[k][j][1][m] + 6.0 * rsd[k][j][2][m] - + 4.0 * rsd[k][j][3][m] + rsd[k][j][4][m]); + } + for(i = 3; i < nx - 3; i++) + { + for(m = 0; m < 5; m++) + { + frct[k][j][i][m] = + frct[k][j][i][m] - + dssp * (rsd[k][j][i - 2][m] - 4.0 * rsd[k][j][i - 1][m] + + 6.0 * 
rsd[k][j][i][m] - 4.0 * rsd[k][j][i + 1][m] + + rsd[k][j][i + 2][m]); + } + } + for(m = 0; m < 5; m++) + { + frct[k][j][nx - 3][m] = + frct[k][j][nx - 3][m] - + dssp * (rsd[k][j][nx - 5][m] - 4.0 * rsd[k][j][nx - 4][m] + + 6.0 * rsd[k][j][nx - 3][m] - 4.0 * rsd[k][j][nx - 2][m]); + frct[k][j][nx - 2][m] = + frct[k][j][nx - 2][m] - + dssp * (rsd[k][j][nx - 4][m] - 4.0 * rsd[k][j][nx - 3][m] + + 5.0 * rsd[k][j][nx - 2][m]); + } + } + } +/* + * --------------------------------------------------------------------- + * eta-direction flux differences + * --------------------------------------------------------------------- + */ +#pragma omp for + for(k = 1; k < nz - 1; k++) + { + for(i = ist; i < iend; i++) + { + for(j = 0; j < ny; j++) + { + flux[j][0] = rsd[k][j][i][2]; + u31 = rsd[k][j][i][2] / rsd[k][j][i][0]; + q = 0.50 * + (rsd[k][j][i][1] * rsd[k][j][i][1] + + rsd[k][j][i][2] * rsd[k][j][i][2] + + rsd[k][j][i][3] * rsd[k][j][i][3]) / + rsd[k][j][i][0]; + flux[j][1] = rsd[k][j][i][1] * u31; + flux[j][2] = rsd[k][j][i][2] * u31 + C2 * (rsd[k][j][i][4] - q); + flux[j][3] = rsd[k][j][i][3] * u31; + flux[j][4] = (C1 * rsd[k][j][i][4] - C2 * q) * u31; + } + for(j = jst; j < jend; j++) + { + for(m = 0; m < 5; m++) + { + frct[k][j][i][m] = + frct[k][j][i][m] - ty2 * (flux[j + 1][m] - flux[j - 1][m]); + } + } + for(j = jst; j < ny; j++) + { + tmp = 1.0 / rsd[k][j][i][0]; + u21j = tmp * rsd[k][j][i][1]; + u31j = tmp * rsd[k][j][i][2]; + u41j = tmp * rsd[k][j][i][3]; + u51j = tmp * rsd[k][j][i][4]; + tmp = 1.0 / rsd[k][j - 1][i][0]; + u21jm1 = tmp * rsd[k][j - 1][i][1]; + u31jm1 = tmp * rsd[k][j - 1][i][2]; + u41jm1 = tmp * rsd[k][j - 1][i][3]; + u51jm1 = tmp * rsd[k][j - 1][i][4]; + flux[j][1] = ty3 * (u21j - u21jm1); + flux[j][2] = (4.0 / 3.0) * ty3 * (u31j - u31jm1); + flux[j][3] = ty3 * (u41j - u41jm1); + flux[j][4] = 0.50 * (1.0 - C1 * C5) * ty3 * + ((u21j * u21j + u31j * u31j + u41j * u41j) - + (u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1)) + + (1.0 / 6.0) * 
ty3 * (u31j * u31j - u31jm1 * u31jm1) + + C1 * C5 * ty3 * (u51j - u51jm1); + } + for(j = jst; j < jend; j++) + { + frct[k][j][i][0] = + frct[k][j][i][0] + dy1 * ty1 * + (rsd[k][j - 1][i][0] - 2.0 * rsd[k][j][i][0] + + rsd[k][j + 1][i][0]); + frct[k][j][i][1] = frct[k][j][i][1] + + ty3 * C3 * C4 * (flux[j + 1][1] - flux[j][1]) + + dy2 * ty1 * + (rsd[k][j - 1][i][1] - 2.0 * rsd[k][j][i][1] + + rsd[k][j + 1][i][1]); + frct[k][j][i][2] = frct[k][j][i][2] + + ty3 * C3 * C4 * (flux[j + 1][2] - flux[j][2]) + + dy3 * ty1 * + (rsd[k][j - 1][i][2] - 2.0 * rsd[k][j][i][2] + + rsd[k][j + 1][i][2]); + frct[k][j][i][3] = frct[k][j][i][3] + + ty3 * C3 * C4 * (flux[j + 1][3] - flux[j][3]) + + dy4 * ty1 * + (rsd[k][j - 1][i][3] - 2.0 * rsd[k][j][i][3] + + rsd[k][j + 1][i][3]); + frct[k][j][i][4] = frct[k][j][i][4] + + ty3 * C3 * C4 * (flux[j + 1][4] - flux[j][4]) + + dy5 * ty1 * + (rsd[k][j - 1][i][4] - 2.0 * rsd[k][j][i][4] + + rsd[k][j + 1][i][4]); + } + /* + * --------------------------------------------------------------------- + * fourth-order dissipation + * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + frct[k][1][i][m] = + frct[k][1][i][m] - dssp * (+5.0 * rsd[k][1][i][m] - + 4.0 * rsd[k][2][i][m] + rsd[k][3][i][m]); + frct[k][2][i][m] = + frct[k][2][i][m] - + dssp * (-4.0 * rsd[k][1][i][m] + 6.0 * rsd[k][2][i][m] - + 4.0 * rsd[k][3][i][m] + rsd[k][4][i][m]); + } + for(j = 3; j < ny - 3; j++) + { + for(m = 0; m < 5; m++) + { + frct[k][j][i][m] = + frct[k][j][i][m] - + dssp * (rsd[k][j - 2][i][m] - 4.0 * rsd[k][j - 1][i][m] + + 6.0 * rsd[k][j][i][m] - 4.0 * rsd[k][j + 1][i][m] + + rsd[k][j + 2][i][m]); + } + } + for(m = 0; m < 5; m++) + { + frct[k][ny - 3][i][m] = + frct[k][ny - 3][i][m] - + dssp * (rsd[k][ny - 5][i][m] - 4.0 * rsd[k][ny - 4][i][m] + + 6.0 * rsd[k][ny - 3][i][m] - 4.0 * rsd[k][ny - 2][i][m]); + frct[k][ny - 2][i][m] = + frct[k][ny - 2][i][m] - + dssp * (rsd[k][ny - 4][i][m] - 4.0 * rsd[k][ny - 
3][i][m] + + 5.0 * rsd[k][ny - 2][i][m]); + } + } + } +/* + * --------------------------------------------------------------------- + * zeta-direction flux differences + * --------------------------------------------------------------------- + */ +#pragma omp for + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + for(k = 0; k < nz; k++) + { + flux[k][0] = rsd[k][j][i][3]; + u41 = rsd[k][j][i][3] / rsd[k][j][i][0]; + q = 0.50 * + (rsd[k][j][i][1] * rsd[k][j][i][1] + + rsd[k][j][i][2] * rsd[k][j][i][2] + + rsd[k][j][i][3] * rsd[k][j][i][3]) / + rsd[k][j][i][0]; + flux[k][1] = rsd[k][j][i][1] * u41; + flux[k][2] = rsd[k][j][i][2] * u41; + flux[k][3] = rsd[k][j][i][3] * u41 + C2 * (rsd[k][j][i][4] - q); + flux[k][4] = (C1 * rsd[k][j][i][4] - C2 * q) * u41; + } + for(k = 1; k < nz - 1; k++) + { + for(m = 0; m < 5; m++) + { + frct[k][j][i][m] = + frct[k][j][i][m] - tz2 * (flux[k + 1][m] - flux[k - 1][m]); + } + } + for(k = 1; k < nz; k++) + { + tmp = 1.0 / rsd[k][j][i][0]; + u21k = tmp * rsd[k][j][i][1]; + u31k = tmp * rsd[k][j][i][2]; + u41k = tmp * rsd[k][j][i][3]; + u51k = tmp * rsd[k][j][i][4]; + tmp = 1.0 / rsd[k - 1][j][i][0]; + u21km1 = tmp * rsd[k - 1][j][i][1]; + u31km1 = tmp * rsd[k - 1][j][i][2]; + u41km1 = tmp * rsd[k - 1][j][i][3]; + u51km1 = tmp * rsd[k - 1][j][i][4]; + flux[k][1] = tz3 * (u21k - u21km1); + flux[k][2] = tz3 * (u31k - u31km1); + flux[k][3] = (4.0 / 3.0) * tz3 * (u41k - u41km1); + flux[k][4] = 0.50 * (1.0 - C1 * C5) * tz3 * + ((u21k * u21k + u31k * u31k + u41k * u41k) - + (u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1)) + + (1.0 / 6.0) * tz3 * (u41k * u41k - u41km1 * u41km1) + + C1 * C5 * tz3 * (u51k - u51km1); + } + for(k = 1; k < nz - 1; k++) + { + frct[k][j][i][0] = + frct[k][j][i][0] + dz1 * tz1 * + (rsd[k + 1][j][i][0] - 2.0 * rsd[k][j][i][0] + + rsd[k - 1][j][i][0]); + frct[k][j][i][1] = frct[k][j][i][1] + + tz3 * C3 * C4 * (flux[k + 1][1] - flux[k][1]) + + dz2 * tz1 * + (rsd[k + 1][j][i][1] - 2.0 * 
rsd[k][j][i][1] + + rsd[k - 1][j][i][1]); + frct[k][j][i][2] = frct[k][j][i][2] + + tz3 * C3 * C4 * (flux[k + 1][2] - flux[k][2]) + + dz3 * tz1 * + (rsd[k + 1][j][i][2] - 2.0 * rsd[k][j][i][2] + + rsd[k - 1][j][i][2]); + frct[k][j][i][3] = frct[k][j][i][3] + + tz3 * C3 * C4 * (flux[k + 1][3] - flux[k][3]) + + dz4 * tz1 * + (rsd[k + 1][j][i][3] - 2.0 * rsd[k][j][i][3] + + rsd[k - 1][j][i][3]); + frct[k][j][i][4] = frct[k][j][i][4] + + tz3 * C3 * C4 * (flux[k + 1][4] - flux[k][4]) + + dz5 * tz1 * + (rsd[k + 1][j][i][4] - 2.0 * rsd[k][j][i][4] + + rsd[k - 1][j][i][4]); + } + /* + * --------------------------------------------------------------------- + * fourth-order dissipation + * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + frct[1][j][i][m] = + frct[1][j][i][m] - dssp * (+5.0 * rsd[1][j][i][m] - + 4.0 * rsd[2][j][i][m] + rsd[3][j][i][m]); + frct[2][j][i][m] = + frct[2][j][i][m] - + dssp * (-4.0 * rsd[1][j][i][m] + 6.0 * rsd[2][j][i][m] - + 4.0 * rsd[3][j][i][m] + rsd[4][j][i][m]); + } + for(k = 3; k < nz - 3; k++) + { + for(m = 0; m < 5; m++) + { + frct[k][j][i][m] = + frct[k][j][i][m] - + dssp * (rsd[k - 2][j][i][m] - 4.0 * rsd[k - 1][j][i][m] + + 6.0 * rsd[k][j][i][m] - 4.0 * rsd[k + 1][j][i][m] + + rsd[k + 2][j][i][m]); + } + } + for(m = 0; m < 5; m++) + { + frct[nz - 3][j][i][m] = + frct[nz - 3][j][i][m] - + dssp * (rsd[nz - 5][j][i][m] - 4.0 * rsd[nz - 4][j][i][m] + + 6.0 * rsd[nz - 3][j][i][m] - 4.0 * rsd[nz - 2][j][i][m]); + frct[nz - 2][j][i][m] = + frct[nz - 2][j][i][m] - + dssp * (rsd[nz - 4][j][i][m] - 4.0 * rsd[nz - 3][j][i][m] + + 5.0 * rsd[nz - 2][j][i][m]); + } + } + } +} + +/* + * --------------------------------------------------------------------- + * compute the solution error + * --------------------------------------------------------------------- + */ +void +error() +{ + /* + * --------------------------------------------------------------------- + * local variables + * 
--------------------------------------------------------------------- + */ + int i, j, k, m; + double tmp; + double u000ijk[5]; + for(m = 0; m < 5; m++) + { + errnm[m] = 0.0; + } + for(k = 1; k < nz - 1; k++) + { + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + exact(i, j, k, u000ijk); + for(m = 0; m < 5; m++) + { + tmp = (u000ijk[m] - u[k][j][i][m]); + errnm[m] = errnm[m] + tmp * tmp; + } + } + } + } + for(m = 0; m < 5; m++) + { + errnm[m] = sqrt(errnm[m] / (((double) (nx0 - 2)) * (ny0 - 2) * (nz0 - 2))); + } +} + +/* + * --------------------------------------------------------------------- + * compute the exact solution at (i,j,k) and store it in u000ijk[0..4] + * --------------------------------------------------------------------- + */ +void +exact(int i, int j, int k, double u000ijk[]) +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int m; + double xi, eta, zeta; + xi = ((double) i) / (nx0 - 1); + eta = ((double) j) / (ny0 - 1); + zeta = ((double) k) / (nz - 1); + for(m = 0; m < 5; m++) + { + u000ijk[m] = + ce[0][m] + + (ce[1][m] + (ce[4][m] + (ce[7][m] + ce[10][m] * xi) * xi) * xi) * xi + + (ce[2][m] + (ce[5][m] + (ce[8][m] + ce[11][m] * eta) * eta) * eta) * eta + + (ce[3][m] + (ce[6][m] + (ce[9][m] + ce[12][m] * zeta) * zeta) * zeta) * zeta; + } +} + +/* + * --------------------------------------------------------------------- + * compute the lower triangular part of the jacobian matrix + * --------------------------------------------------------------------- + */ +void +jacld(int k) +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j; + double r43; + double c1345; + double c34; + double tmp1, tmp2, tmp3; + r43 = (4.0 / 3.0); + c1345 = C1 * C3 * C4 * C5; + c34 = C3 * C4; + +#pragma omp for nowait
schedule(static) + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + /* + * --------------------------------------------------------------------- + * form the block diagonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k][j][i]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + d[j][i][0][0] = 1.0 + dt * 2.0 * (tx1 * dx1 + ty1 * dy1 + tz1 * dz1); + d[j][i][1][0] = 0.0; + d[j][i][2][0] = 0.0; + d[j][i][3][0] = 0.0; + d[j][i][4][0] = 0.0; + d[j][i][0][1] = + -dt * 2.0 * (tx1 * r43 + ty1 + tz1) * c34 * tmp2 * u[k][j][i][1]; + d[j][i][1][1] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 * r43 + ty1 + tz1) + + dt * 2.0 * (tx1 * dx2 + ty1 * dy2 + tz1 * dz2); + d[j][i][2][1] = 0.0; + d[j][i][3][1] = 0.0; + d[j][i][4][1] = 0.0; + d[j][i][0][2] = + -dt * 2.0 * (tx1 + ty1 * r43 + tz1) * c34 * tmp2 * u[k][j][i][2]; + d[j][i][1][2] = 0.0; + d[j][i][2][2] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 * r43 + tz1) + + dt * 2.0 * (tx1 * dx3 + ty1 * dy3 + tz1 * dz3); + d[j][i][3][2] = 0.0; + d[j][i][4][2] = 0.0; + d[j][i][0][3] = + -dt * 2.0 * (tx1 + ty1 + tz1 * r43) * c34 * tmp2 * u[k][j][i][3]; + d[j][i][1][3] = 0.0; + d[j][i][2][3] = 0.0; + d[j][i][3][3] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 + tz1 * r43) + + dt * 2.0 * (tx1 * dx4 + ty1 * dy4 + tz1 * dz4); + d[j][i][4][3] = 0.0; + d[j][i][0][4] = -dt * 2.0 * + (((tx1 * (r43 * c34 - c1345) + ty1 * (c34 - c1345) + + tz1 * (c34 - c1345)) * + (u[k][j][i][1] * u[k][j][i][1]) + + (tx1 * (c34 - c1345) + ty1 * (r43 * c34 - c1345) + + tz1 * (c34 - c1345)) * + (u[k][j][i][2] * u[k][j][i][2]) + + (tx1 * (c34 - c1345) + ty1 * (c34 - c1345) + + tz1 * (r43 * c34 - c1345)) * + (u[k][j][i][3] * u[k][j][i][3])) * + tmp3 + + (tx1 + ty1 + tz1) * c1345 * tmp2 * u[k][j][i][4]); + d[j][i][1][4] = + dt * 2.0 * tmp2 * u[k][j][i][1] * + (tx1 * (r43 * c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (c34 - c1345)); + d[j][i][2][4] = + dt * 2.0 * tmp2 * u[k][j][i][2] * + (tx1 * (c34 - c1345) + ty1 * (r43 *
c34 - c1345) + tz1 * (c34 - c1345)); + d[j][i][3][4] = + dt * 2.0 * tmp2 * u[k][j][i][3] * + (tx1 * (c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (r43 * c34 - c1345)); + d[j][i][4][4] = 1.0 + dt * 2.0 * (tx1 + ty1 + tz1) * c1345 * tmp1 + + dt * 2.0 * (tx1 * dx5 + ty1 * dy5 + tz1 * dz5); + /* + * --------------------------------------------------------------------- + * form the first block sub-diagonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k - 1][j][i]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + a[j][i][0][0] = -dt * tz1 * dz1; + a[j][i][1][0] = 0.0; + a[j][i][2][0] = 0.0; + a[j][i][3][0] = -dt * tz2; + a[j][i][4][0] = 0.0; + a[j][i][0][1] = + -dt * tz2 * (-(u[k - 1][j][i][1] * u[k - 1][j][i][3]) * tmp2) - + dt * tz1 * (-c34 * tmp2 * u[k - 1][j][i][1]); + a[j][i][1][1] = -dt * tz2 * (u[k - 1][j][i][3] * tmp1) - + dt * tz1 * c34 * tmp1 - dt * tz1 * dz2; + a[j][i][2][1] = 0.0; + a[j][i][3][1] = -dt * tz2 * (u[k - 1][j][i][1] * tmp1); + a[j][i][4][1] = 0.0; + a[j][i][0][2] = + -dt * tz2 * (-(u[k - 1][j][i][2] * u[k - 1][j][i][3]) * tmp2) - + dt * tz1 * (-c34 * tmp2 * u[k - 1][j][i][2]); + a[j][i][1][2] = 0.0; + a[j][i][2][2] = -dt * tz2 * (u[k - 1][j][i][3] * tmp1) - + dt * tz1 * (c34 * tmp1) - dt * tz1 * dz3; + a[j][i][3][2] = -dt * tz2 * (u[k - 1][j][i][2] * tmp1); + a[j][i][4][2] = 0.0; + a[j][i][0][3] = + -dt * tz2 * + (-(u[k - 1][j][i][3] * tmp1) * (u[k - 1][j][i][3] * tmp1) + + C2 * qs[k - 1][j][i] * tmp1) - + dt * tz1 * (-r43 * c34 * tmp2 * u[k - 1][j][i][3]); + a[j][i][1][3] = -dt * tz2 * (-C2 * (u[k - 1][j][i][1] * tmp1)); + a[j][i][2][3] = -dt * tz2 * (-C2 * (u[k - 1][j][i][2] * tmp1)); + a[j][i][3][3] = -dt * tz2 * (2.0 - C2) * (u[k - 1][j][i][3] * tmp1) - + dt * tz1 * (r43 * c34 * tmp1) - dt * tz1 * dz4; + a[j][i][4][3] = -dt * tz2 * C2; + a[j][i][0][4] = + -dt * tz2 * + ((C2 * 2.0 * qs[k - 1][j][i] - C1 * u[k - 1][j][i][4]) * + u[k - 1][j][i][3] * tmp2) - + dt * tz1 * + (-(c34 - c1345) * tmp3 * (u[k 
- 1][j][i][1] * u[k - 1][j][i][1]) - + (c34 - c1345) * tmp3 * (u[k - 1][j][i][2] * u[k - 1][j][i][2]) - + (r43 * c34 - c1345) * tmp3 * + (u[k - 1][j][i][3] * u[k - 1][j][i][3]) - + c1345 * tmp2 * u[k - 1][j][i][4]); + a[j][i][1][4] = + -dt * tz2 * (-C2 * (u[k - 1][j][i][1] * u[k - 1][j][i][3]) * tmp2) - + dt * tz1 * (c34 - c1345) * tmp2 * u[k - 1][j][i][1]; + a[j][i][2][4] = + -dt * tz2 * (-C2 * (u[k - 1][j][i][2] * u[k - 1][j][i][3]) * tmp2) - + dt * tz1 * (c34 - c1345) * tmp2 * u[k - 1][j][i][2]; + a[j][i][3][4] = -dt * tz2 * + (C1 * (u[k - 1][j][i][4] * tmp1) - + C2 * (qs[k - 1][j][i] * tmp1 + + u[k - 1][j][i][3] * u[k - 1][j][i][3] * tmp2)) - + dt * tz1 * (r43 * c34 - c1345) * tmp2 * u[k - 1][j][i][3]; + a[j][i][4][4] = -dt * tz2 * (C1 * (u[k - 1][j][i][3] * tmp1)) - + dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5; + /* + * --------------------------------------------------------------------- + * form the second block sub-diagonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k][j - 1][i]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + b[j][i][0][0] = -dt * ty1 * dy1; + b[j][i][1][0] = 0.0; + b[j][i][2][0] = -dt * ty2; + b[j][i][3][0] = 0.0; + b[j][i][4][0] = 0.0; + b[j][i][0][1] = + -dt * ty2 * (-(u[k][j - 1][i][1] * u[k][j - 1][i][2]) * tmp2) - + dt * ty1 * (-c34 * tmp2 * u[k][j - 1][i][1]); + b[j][i][1][1] = -dt * ty2 * (u[k][j - 1][i][2] * tmp1) - + dt * ty1 * (c34 * tmp1) - dt * ty1 * dy2; + b[j][i][2][1] = -dt * ty2 * (u[k][j - 1][i][1] * tmp1); + b[j][i][3][1] = 0.0; + b[j][i][4][1] = 0.0; + b[j][i][0][2] = + -dt * ty2 * + (-(u[k][j - 1][i][2] * tmp1) * (u[k][j - 1][i][2] * tmp1) + + C2 * (qs[k][j - 1][i] * tmp1)) - + dt * ty1 * (-r43 * c34 * tmp2 * u[k][j - 1][i][2]); + b[j][i][1][2] = -dt * ty2 * (-C2 * (u[k][j - 1][i][1] * tmp1)); + b[j][i][2][2] = -dt * ty2 * ((2.0 - C2) * (u[k][j - 1][i][2] * tmp1)) - + dt * ty1 * (r43 * c34 * tmp1) - dt * ty1 * dy3; + b[j][i][3][2] = -dt * ty2 * (-C2 * (u[k][j - 1][i][3] * 
tmp1)); + b[j][i][4][2] = -dt * ty2 * C2; + b[j][i][0][3] = + -dt * ty2 * (-(u[k][j - 1][i][2] * u[k][j - 1][i][3]) * tmp2) - + dt * ty1 * (-c34 * tmp2 * u[k][j - 1][i][3]); + b[j][i][1][3] = 0.0; + b[j][i][2][3] = -dt * ty2 * (u[k][j - 1][i][3] * tmp1); + b[j][i][3][3] = -dt * ty2 * (u[k][j - 1][i][2] * tmp1) - + dt * ty1 * (c34 * tmp1) - dt * ty1 * dy4; + b[j][i][4][3] = 0.0; + b[j][i][0][4] = + -dt * ty2 * + ((C2 * 2.0 * qs[k][j - 1][i] - C1 * u[k][j - 1][i][4]) * + (u[k][j - 1][i][2] * tmp2)) - + dt * ty1 * + (-(c34 - c1345) * tmp3 * (u[k][j - 1][i][1] * u[k][j - 1][i][1]) - + (r43 * c34 - c1345) * tmp3 * + (u[k][j - 1][i][2] * u[k][j - 1][i][2]) - + (c34 - c1345) * tmp3 * (u[k][j - 1][i][3] * u[k][j - 1][i][3]) - + c1345 * tmp2 * u[k][j - 1][i][4]); + b[j][i][1][4] = + -dt * ty2 * (-C2 * (u[k][j - 1][i][1] * u[k][j - 1][i][2]) * tmp2) - + dt * ty1 * (c34 - c1345) * tmp2 * u[k][j - 1][i][1]; + b[j][i][2][4] = -dt * ty2 * + (C1 * (u[k][j - 1][i][4] * tmp1) - + C2 * (qs[k][j - 1][i] * tmp1 + + u[k][j - 1][i][2] * u[k][j - 1][i][2] * tmp2)) - + dt * ty1 * (r43 * c34 - c1345) * tmp2 * u[k][j - 1][i][2]; + b[j][i][3][4] = + -dt * ty2 * (-C2 * (u[k][j - 1][i][2] * u[k][j - 1][i][3]) * tmp2) - + dt * ty1 * (c34 - c1345) * tmp2 * u[k][j - 1][i][3]; + b[j][i][4][4] = -dt * ty2 * (C1 * (u[k][j - 1][i][2] * tmp1)) - + dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5; + /* + * --------------------------------------------------------------------- + * form the third block sub-diagonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k][j][i - 1]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + c[j][i][0][0] = -dt * tx1 * dx1; + c[j][i][1][0] = -dt * tx2; + c[j][i][2][0] = 0.0; + c[j][i][3][0] = 0.0; + c[j][i][4][0] = 0.0; + c[j][i][0][1] = + -dt * tx2 * + (-(u[k][j][i - 1][1] * tmp1) * (u[k][j][i - 1][1] * tmp1) + + C2 * qs[k][j][i - 1] * tmp1) - + dt * tx1 * (-r43 * c34 * tmp2 * u[k][j][i - 1][1]); + c[j][i][1][1] = -dt * tx2 * ((2.0 - C2) 
* (u[k][j][i - 1][1] * tmp1)) - + dt * tx1 * (r43 * c34 * tmp1) - dt * tx1 * dx2; + c[j][i][2][1] = -dt * tx2 * (-C2 * (u[k][j][i - 1][2] * tmp1)); + c[j][i][3][1] = -dt * tx2 * (-C2 * (u[k][j][i - 1][3] * tmp1)); + c[j][i][4][1] = -dt * tx2 * C2; + c[j][i][0][2] = + -dt * tx2 * (-(u[k][j][i - 1][1] * u[k][j][i - 1][2]) * tmp2) - + dt * tx1 * (-c34 * tmp2 * u[k][j][i - 1][2]); + c[j][i][1][2] = -dt * tx2 * (u[k][j][i - 1][2] * tmp1); + c[j][i][2][2] = -dt * tx2 * (u[k][j][i - 1][1] * tmp1) - + dt * tx1 * (c34 * tmp1) - dt * tx1 * dx3; + c[j][i][3][2] = 0.0; + c[j][i][4][2] = 0.0; + c[j][i][0][3] = + -dt * tx2 * (-(u[k][j][i - 1][1] * u[k][j][i - 1][3]) * tmp2) - + dt * tx1 * (-c34 * tmp2 * u[k][j][i - 1][3]); + c[j][i][1][3] = -dt * tx2 * (u[k][j][i - 1][3] * tmp1); + c[j][i][2][3] = 0.0; + c[j][i][3][3] = -dt * tx2 * (u[k][j][i - 1][1] * tmp1) - + dt * tx1 * (c34 * tmp1) - dt * tx1 * dx4; + c[j][i][4][3] = 0.0; + c[j][i][0][4] = + -dt * tx2 * + ((C2 * 2.0 * qs[k][j][i - 1] - C1 * u[k][j][i - 1][4]) * + u[k][j][i - 1][1] * tmp2) - + dt * tx1 * + (-(r43 * c34 - c1345) * tmp3 * + (u[k][j][i - 1][1] * u[k][j][i - 1][1]) - + (c34 - c1345) * tmp3 * (u[k][j][i - 1][2] * u[k][j][i - 1][2]) - + (c34 - c1345) * tmp3 * (u[k][j][i - 1][3] * u[k][j][i - 1][3]) - + c1345 * tmp2 * u[k][j][i - 1][4]); + c[j][i][1][4] = -dt * tx2 * + (C1 * (u[k][j][i - 1][4] * tmp1) - + C2 * (u[k][j][i - 1][1] * u[k][j][i - 1][1] * tmp2 + + qs[k][j][i - 1] * tmp1)) - + dt * tx1 * (r43 * c34 - c1345) * tmp2 * u[k][j][i - 1][1]; + c[j][i][2][4] = + -dt * tx2 * (-C2 * (u[k][j][i - 1][2] * u[k][j][i - 1][1]) * tmp2) - + dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i - 1][2]; + c[j][i][3][4] = + -dt * tx2 * (-C2 * (u[k][j][i - 1][3] * u[k][j][i - 1][1]) * tmp2) - + dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i - 1][3]; + c[j][i][4][4] = -dt * tx2 * (C1 * (u[k][j][i - 1][1] * tmp1)) - + dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5; + } + } +} + +/* + * 
--------------------------------------------------------------------- + * compute the upper triangular part of the jacobian matrix + * --------------------------------------------------------------------- + */ +void +jacu(int k) +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j; + double r43; + double c1345; + double c34; + double tmp1, tmp2, tmp3; + r43 = (4.0 / 3.0); + c1345 = C1 * C3 * C4 * C5; + c34 = C3 * C4; + +#pragma omp for nowait schedule(static) + for(j = jend - 1; j >= jst; j--) + { + for(i = iend - 1; i >= ist; i--) + { + /* + * --------------------------------------------------------------------- + * form the block daigonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k][j][i]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + d[j][i][0][0] = 1.0 + dt * 2.0 * (tx1 * dx1 + ty1 * dy1 + tz1 * dz1); + d[j][i][1][0] = 0.0; + d[j][i][2][0] = 0.0; + d[j][i][3][0] = 0.0; + d[j][i][4][0] = 0.0; + d[j][i][0][1] = + dt * 2.0 * (-tx1 * r43 - ty1 - tz1) * (c34 * tmp2 * u[k][j][i][1]); + d[j][i][1][1] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 * r43 + ty1 + tz1) + + dt * 2.0 * (tx1 * dx2 + ty1 * dy2 + tz1 * dz2); + d[j][i][2][1] = 0.0; + d[j][i][3][1] = 0.0; + d[j][i][4][1] = 0.0; + d[j][i][0][2] = + dt * 2.0 * (-tx1 - ty1 * r43 - tz1) * (c34 * tmp2 * u[k][j][i][2]); + d[j][i][1][2] = 0.0; + d[j][i][2][2] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 * r43 + tz1) + + dt * 2.0 * (tx1 * dx3 + ty1 * dy3 + tz1 * dz3); + d[j][i][3][2] = 0.0; + d[j][i][4][2] = 0.0; + d[j][i][0][3] = + dt * 2.0 * (-tx1 - ty1 - tz1 * r43) * (c34 * tmp2 * u[k][j][i][3]); + d[j][i][1][3] = 0.0; + d[j][i][2][3] = 0.0; + d[j][i][3][3] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 + tz1 * r43) + + dt * 2.0 * (tx1 * dx4 + ty1 * dy4 + tz1 * dz4); + d[j][i][4][3] = 0.0; + d[j][i][0][4] = -dt * 2.0 * + (((tx1 * (r43 * c34 - 
c1345) + ty1 * (c34 - c1345) + + tz1 * (c34 - c1345)) * + (u[k][j][i][1] * u[k][j][i][1]) + + (tx1 * (c34 - c1345) + ty1 * (r43 * c34 - c1345) + + tz1 * (c34 - c1345)) * + (u[k][j][i][2] * u[k][j][i][2]) + + (tx1 * (c34 - c1345) + ty1 * (c34 - c1345) + + tz1 * (r43 * c34 - c1345)) * + (u[k][j][i][3] * u[k][j][i][3])) * + tmp3 + + (tx1 + ty1 + tz1) * c1345 * tmp2 * u[k][j][i][4]); + d[j][i][1][4] = + dt * 2.0 * + (tx1 * (r43 * c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (c34 - c1345)) * + tmp2 * u[k][j][i][1]; + d[j][i][2][4] = + dt * 2.0 * + (tx1 * (c34 - c1345) + ty1 * (r43 * c34 - c1345) + tz1 * (c34 - c1345)) * + tmp2 * u[k][j][i][2]; + d[j][i][3][4] = + dt * 2.0 * + (tx1 * (c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (r43 * c34 - c1345)) * + tmp2 * u[k][j][i][3]; + d[j][i][4][4] = 1.0 + dt * 2.0 * (tx1 + ty1 + tz1) * c1345 * tmp1 + + dt * 2.0 * (tx1 * dx5 + ty1 * dy5 + tz1 * dz5); + /* + * --------------------------------------------------------------------- + * form the first block sub-diagonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k][j][i + 1]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + a[j][i][0][0] = -dt * tx1 * dx1; + a[j][i][1][0] = dt * tx2; + a[j][i][2][0] = 0.0; + a[j][i][3][0] = 0.0; + a[j][i][4][0] = 0.0; + a[j][i][0][1] = + dt * tx2 * + (-(u[k][j][i + 1][1] * tmp1) * (u[k][j][i + 1][1] * tmp1) + + C2 * qs[k][j][i + 1] * tmp1) - + dt * tx1 * (-r43 * c34 * tmp2 * u[k][j][i + 1][1]); + a[j][i][1][1] = dt * tx2 * ((2.0 - C2) * (u[k][j][i + 1][1] * tmp1)) - + dt * tx1 * (r43 * c34 * tmp1) - dt * tx1 * dx2; + a[j][i][2][1] = dt * tx2 * (-C2 * (u[k][j][i + 1][2] * tmp1)); + a[j][i][3][1] = dt * tx2 * (-C2 * (u[k][j][i + 1][3] * tmp1)); + a[j][i][4][1] = dt * tx2 * C2; + a[j][i][0][2] = dt * tx2 * (-(u[k][j][i + 1][1] * u[k][j][i + 1][2]) * tmp2) - + dt * tx1 * (-c34 * tmp2 * u[k][j][i + 1][2]); + a[j][i][1][2] = dt * tx2 * (u[k][j][i + 1][2] * tmp1); + a[j][i][2][2] = dt * tx2 * (u[k][j][i + 1][1] * 
tmp1) - + dt * tx1 * (c34 * tmp1) - dt * tx1 * dx3; + a[j][i][3][2] = 0.0; + a[j][i][4][2] = 0.0; + a[j][i][0][3] = dt * tx2 * (-(u[k][j][i + 1][1] * u[k][j][i + 1][3]) * tmp2) - + dt * tx1 * (-c34 * tmp2 * u[k][j][i + 1][3]); + a[j][i][1][3] = dt * tx2 * (u[k][j][i + 1][3] * tmp1); + a[j][i][2][3] = 0.0; + a[j][i][3][3] = dt * tx2 * (u[k][j][i + 1][1] * tmp1) - + dt * tx1 * (c34 * tmp1) - dt * tx1 * dx4; + a[j][i][4][3] = 0.0; + a[j][i][0][4] = + dt * tx2 * + ((C2 * 2.0 * qs[k][j][i + 1] - C1 * u[k][j][i + 1][4]) * + (u[k][j][i + 1][1] * tmp2)) - + dt * tx1 * + (-(r43 * c34 - c1345) * tmp3 * + (u[k][j][i + 1][1] * u[k][j][i + 1][1]) - + (c34 - c1345) * tmp3 * (u[k][j][i + 1][2] * u[k][j][i + 1][2]) - + (c34 - c1345) * tmp3 * (u[k][j][i + 1][3] * u[k][j][i + 1][3]) - + c1345 * tmp2 * u[k][j][i + 1][4]); + a[j][i][1][4] = dt * tx2 * + (C1 * (u[k][j][i + 1][4] * tmp1) - + C2 * (u[k][j][i + 1][1] * u[k][j][i + 1][1] * tmp2 + + qs[k][j][i + 1] * tmp1)) - + dt * tx1 * (r43 * c34 - c1345) * tmp2 * u[k][j][i + 1][1]; + a[j][i][2][4] = + dt * tx2 * (-C2 * (u[k][j][i + 1][2] * u[k][j][i + 1][1]) * tmp2) - + dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i + 1][2]; + a[j][i][3][4] = + dt * tx2 * (-C2 * (u[k][j][i + 1][3] * u[k][j][i + 1][1]) * tmp2) - + dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i + 1][3]; + a[j][i][4][4] = dt * tx2 * (C1 * (u[k][j][i + 1][1] * tmp1)) - + dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5; + /* + * --------------------------------------------------------------------- + * form the second block sub-diagonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k][j + 1][i]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + b[j][i][0][0] = -dt * ty1 * dy1; + b[j][i][1][0] = 0.0; + b[j][i][2][0] = dt * ty2; + b[j][i][3][0] = 0.0; + b[j][i][4][0] = 0.0; + b[j][i][0][1] = dt * ty2 * (-(u[k][j + 1][i][1] * u[k][j + 1][i][2]) * tmp2) - + dt * ty1 * (-c34 * tmp2 * u[k][j + 1][i][1]); + b[j][i][1][1] = dt * ty2 * (u[k][j + 1][i][2] 
* tmp1) - + dt * ty1 * (c34 * tmp1) - dt * ty1 * dy2; + b[j][i][2][1] = dt * ty2 * (u[k][j + 1][i][1] * tmp1); + b[j][i][3][1] = 0.0; + b[j][i][4][1] = 0.0; + b[j][i][0][2] = + dt * ty2 * + (-(u[k][j + 1][i][2] * tmp1) * (u[k][j + 1][i][2] * tmp1) + + C2 * (qs[k][j + 1][i] * tmp1)) - + dt * ty1 * (-r43 * c34 * tmp2 * u[k][j + 1][i][2]); + b[j][i][1][2] = dt * ty2 * (-C2 * (u[k][j + 1][i][1] * tmp1)); + b[j][i][2][2] = dt * ty2 * ((2.0 - C2) * (u[k][j + 1][i][2] * tmp1)) - + dt * ty1 * (r43 * c34 * tmp1) - dt * ty1 * dy3; + b[j][i][3][2] = dt * ty2 * (-C2 * (u[k][j + 1][i][3] * tmp1)); + b[j][i][4][2] = dt * ty2 * C2; + b[j][i][0][3] = dt * ty2 * (-(u[k][j + 1][i][2] * u[k][j + 1][i][3]) * tmp2) - + dt * ty1 * (-c34 * tmp2 * u[k][j + 1][i][3]); + b[j][i][1][3] = 0.0; + b[j][i][2][3] = dt * ty2 * (u[k][j + 1][i][3] * tmp1); + b[j][i][3][3] = dt * ty2 * (u[k][j + 1][i][2] * tmp1) - + dt * ty1 * (c34 * tmp1) - dt * ty1 * dy4; + b[j][i][4][3] = 0.0; + b[j][i][0][4] = + dt * ty2 * + ((C2 * 2.0 * qs[k][j + 1][i] - C1 * u[k][j + 1][i][4]) * + (u[k][j + 1][i][2] * tmp2)) - + dt * ty1 * + (-(c34 - c1345) * tmp3 * (u[k][j + 1][i][1] * u[k][j + 1][i][1]) - + (r43 * c34 - c1345) * tmp3 * + (u[k][j + 1][i][2] * u[k][j + 1][i][2]) - + (c34 - c1345) * tmp3 * (u[k][j + 1][i][3] * u[k][j + 1][i][3]) - + c1345 * tmp2 * u[k][j + 1][i][4]); + b[j][i][1][4] = + dt * ty2 * (-C2 * (u[k][j + 1][i][1] * u[k][j + 1][i][2]) * tmp2) - + dt * ty1 * (c34 - c1345) * tmp2 * u[k][j + 1][i][1]; + b[j][i][2][4] = dt * ty2 * + (C1 * (u[k][j + 1][i][4] * tmp1) - + C2 * (qs[k][j + 1][i] * tmp1 + + u[k][j + 1][i][2] * u[k][j + 1][i][2] * tmp2)) - + dt * ty1 * (r43 * c34 - c1345) * tmp2 * u[k][j + 1][i][2]; + b[j][i][3][4] = + dt * ty2 * (-C2 * (u[k][j + 1][i][2] * u[k][j + 1][i][3]) * tmp2) - + dt * ty1 * (c34 - c1345) * tmp2 * u[k][j + 1][i][3]; + b[j][i][4][4] = dt * ty2 * (C1 * (u[k][j + 1][i][2] * tmp1)) - + dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5; + /* + * 
--------------------------------------------------------------------- + * form the third block sub-diagonal + * --------------------------------------------------------------------- + */ + tmp1 = rho_i[k + 1][j][i]; + tmp2 = tmp1 * tmp1; + tmp3 = tmp1 * tmp2; + c[j][i][0][0] = -dt * tz1 * dz1; + c[j][i][1][0] = 0.0; + c[j][i][2][0] = 0.0; + c[j][i][3][0] = dt * tz2; + c[j][i][4][0] = 0.0; + c[j][i][0][1] = dt * tz2 * (-(u[k + 1][j][i][1] * u[k + 1][j][i][3]) * tmp2) - + dt * tz1 * (-c34 * tmp2 * u[k + 1][j][i][1]); + c[j][i][1][1] = dt * tz2 * (u[k + 1][j][i][3] * tmp1) - + dt * tz1 * c34 * tmp1 - dt * tz1 * dz2; + c[j][i][2][1] = 0.0; + c[j][i][3][1] = dt * tz2 * (u[k + 1][j][i][1] * tmp1); + c[j][i][4][1] = 0.0; + c[j][i][0][2] = dt * tz2 * (-(u[k + 1][j][i][2] * u[k + 1][j][i][3]) * tmp2) - + dt * tz1 * (-c34 * tmp2 * u[k + 1][j][i][2]); + c[j][i][1][2] = 0.0; + c[j][i][2][2] = dt * tz2 * (u[k + 1][j][i][3] * tmp1) - + dt * tz1 * (c34 * tmp1) - dt * tz1 * dz3; + c[j][i][3][2] = dt * tz2 * (u[k + 1][j][i][2] * tmp1); + c[j][i][4][2] = 0.0; + c[j][i][0][3] = + dt * tz2 * + (-(u[k + 1][j][i][3] * tmp1) * (u[k + 1][j][i][3] * tmp1) + + C2 * (qs[k + 1][j][i] * tmp1)) - + dt * tz1 * (-r43 * c34 * tmp2 * u[k + 1][j][i][3]); + c[j][i][1][3] = dt * tz2 * (-C2 * (u[k + 1][j][i][1] * tmp1)); + c[j][i][2][3] = dt * tz2 * (-C2 * (u[k + 1][j][i][2] * tmp1)); + c[j][i][3][3] = dt * tz2 * (2.0 - C2) * (u[k + 1][j][i][3] * tmp1) - + dt * tz1 * (r43 * c34 * tmp1) - dt * tz1 * dz4; + c[j][i][4][3] = dt * tz2 * C2; + c[j][i][0][4] = + dt * tz2 * + ((C2 * 2.0 * qs[k + 1][j][i] - C1 * u[k + 1][j][i][4]) * + (u[k + 1][j][i][3] * tmp2)) - + dt * tz1 * + (-(c34 - c1345) * tmp3 * (u[k + 1][j][i][1] * u[k + 1][j][i][1]) - + (c34 - c1345) * tmp3 * (u[k + 1][j][i][2] * u[k + 1][j][i][2]) - + (r43 * c34 - c1345) * tmp3 * + (u[k + 1][j][i][3] * u[k + 1][j][i][3]) - + c1345 * tmp2 * u[k + 1][j][i][4]); + c[j][i][1][4] = + dt * tz2 * (-C2 * (u[k + 1][j][i][1] * u[k + 1][j][i][3]) * tmp2) - + dt 
* tz1 * (c34 - c1345) * tmp2 * u[k + 1][j][i][1]; + c[j][i][2][4] = + dt * tz2 * (-C2 * (u[k + 1][j][i][2] * u[k + 1][j][i][3]) * tmp2) - + dt * tz1 * (c34 - c1345) * tmp2 * u[k + 1][j][i][2]; + c[j][i][3][4] = dt * tz2 * + (C1 * (u[k + 1][j][i][4] * tmp1) - + C2 * (qs[k + 1][j][i] * tmp1 + + u[k + 1][j][i][3] * u[k + 1][j][i][3] * tmp2)) - + dt * tz1 * (r43 * c34 - c1345) * tmp2 * u[k + 1][j][i][3]; + c[j][i][4][4] = dt * tz2 * (C1 * (u[k + 1][j][i][3] * tmp1)) - + dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5; + } + } +} + +/* + * --------------------------------------------------------------------- + * to compute the l2-norm of vector v. + * --------------------------------------------------------------------- + * to improve cache performance, second two dimensions padded by 1 + * for even number sizes only. Only needed in v. + * --------------------------------------------------------------------- + */ +void +l2norm(int nx0, int ny0, int nz0, int ist, int iend, int jst, int jend, + double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], double sum[]) +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, k, m; + double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0; + +#pragma omp single + for(m = 0; m < 5; m++) + { + sum[m] = 0.0; + } + +#pragma omp for nowait + for(k = 1; k < nz0 - 1; k++) + { + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + sum0 = sum0 + v[i][j][k][0] * v[i][j][k][0]; + sum1 = sum1 + v[i][j][k][1] * v[i][j][k][1]; + sum2 = sum2 + v[i][j][k][2] * v[i][j][k][2]; + sum3 = sum3 + v[i][j][k][3] * v[i][j][k][3]; + sum4 = sum4 + v[i][j][k][4] * v[i][j][k][4]; + } + } + } + +#pragma omp critical + { + sum[0] += sum0; + sum[1] += sum1; + sum[2] += sum2; + sum[3] += sum3; + sum[4] += sum4; + } +#pragma omp barrier + +#pragma omp single + for(m = 0; m < 5; m++) + { + sum[m] = sqrt(sum[m] / 
((nx0 - 2) * (ny0 - 2) * (nz0 - 2))); + } +} + +void +pintgr() +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, k; + int ibeg, ifin, ifin1; + int jbeg, jfin, jfin1; + double phi1[ISIZ3 + 2][ISIZ2 + 2]; + double phi2[ISIZ3 + 2][ISIZ2 + 2]; + double frc1, frc2, frc3; + /* + * --------------------------------------------------------------------- + * set up the sub-domains for integeration in each processor + * --------------------------------------------------------------------- + */ + ibeg = ii1; + ifin = ii2; + jbeg = ji1; + jfin = ji2; + ifin1 = ifin - 1; + jfin1 = jfin - 1; + /* + * --------------------------------------------------------------------- + * initialize + * --------------------------------------------------------------------- + */ + for(i = 0; i <= ISIZ2 + 1; i++) + { + for(k = 0; k <= ISIZ3 + 1; k++) + { + phi1[k][i] = 0.0; + phi2[k][i] = 0.0; + } + } + for(j = jbeg; j < jfin; j++) + { + for(i = ibeg; i < ifin; i++) + { + k = ki1; + phi1[j][i] = C2 * (u[k][j][i][4] - 0.50 * + (u[k][j][i][1] * u[k][j][i][1] + + u[k][j][i][2] * u[k][j][i][2] + + u[k][j][i][3] * u[k][j][i][3]) / + u[k][j][i][0]); + k = ki2 - 1; + phi2[j][i] = C2 * (u[k][j][i][4] - 0.50 * + (u[k][j][i][1] * u[k][j][i][1] + + u[k][j][i][2] * u[k][j][i][2] + + u[k][j][i][3] * u[k][j][i][3]) / + u[k][j][i][0]); + } + } + frc1 = 0.0; + for(j = jbeg; j < jfin1; j++) + { + for(i = ibeg; i < ifin1; i++) + { + frc1 = frc1 + + (phi1[j][i] + phi1[j][i + 1] + phi1[j + 1][i] + phi1[j + 1][i + 1] + + phi2[j][i] + phi2[j][i + 1] + phi2[j + 1][i] + phi2[j + 1][i + 1]); + } + } + frc1 = dxi * deta * frc1; + /* + * --------------------------------------------------------------------- + * initialize + * --------------------------------------------------------------------- + */ + for(i = 0; i <= ISIZ2 + 1; i++) + { + for(k = 0; k <= ISIZ3 + 1; k++) + { + phi1[k][i] 
= 0.0; + phi2[k][i] = 0.0; + } + } + if(jbeg == ji1) + { + for(k = ki1; k < ki2; k++) + { + for(i = ibeg; i < ifin; i++) + { + phi1[k][i] = + C2 * (u[k][jbeg][i][4] - 0.50 * + (u[k][jbeg][i][1] * u[k][jbeg][i][1] + + u[k][jbeg][i][2] * u[k][jbeg][i][2] + + u[k][jbeg][i][3] * u[k][jbeg][i][3]) / + u[k][jbeg][i][0]); + } + } + } + if(jfin == ji2) + { + for(k = ki1; k < ki2; k++) + { + for(i = ibeg; i < ifin; i++) + { + phi2[k][i] = C2 * (u[k][jfin - 1][i][4] - + 0.50 * + (u[k][jfin - 1][i][1] * u[k][jfin - 1][i][1] + + u[k][jfin - 1][i][2] * u[k][jfin - 1][i][2] + + u[k][jfin - 1][i][3] * u[k][jfin - 1][i][3]) / + u[k][jfin - 1][i][0]); + } + } + } + frc2 = 0.0; + for(k = ki1; k < ki2 - 1; k++) + { + for(i = ibeg; i < ifin1; i++) + { + frc2 = frc2 + + (phi1[k][i] + phi1[k][i + 1] + phi1[k + 1][i] + phi1[k + 1][i + 1] + + phi2[k][i] + phi2[k][i + 1] + phi2[k + 1][i] + phi2[k + 1][i + 1]); + } + } + frc2 = dxi * dzeta * frc2; + /* + * --------------------------------------------------------------------- + * initialize + * --------------------------------------------------------------------- + */ + for(i = 0; i <= ISIZ2 + 1; i++) + { + for(k = 0; k <= ISIZ3 + 1; k++) + { + phi1[k][i] = 0.0; + phi2[k][i] = 0.0; + } + } + if(ibeg == ii1) + { + for(k = ki1; k < ki2; k++) + { + for(j = jbeg; j < jfin; j++) + { + phi1[k][j] = + C2 * (u[k][j][ibeg][4] - 0.50 * + (u[k][j][ibeg][1] * u[k][j][ibeg][1] + + u[k][j][ibeg][2] * u[k][j][ibeg][2] + + u[k][j][ibeg][3] * u[k][j][ibeg][3]) / + u[k][j][ibeg][0]); + } + } + } + if(ifin == ii2) + { + for(k = ki1; k < ki2; k++) + { + for(j = jbeg; j < jfin; j++) + { + phi2[k][j] = C2 * (u[k][j][ifin - 1][4] - + 0.50 * + (u[k][j][ifin - 1][1] * u[k][j][ifin - 1][1] + + u[k][j][ifin - 1][2] * u[k][j][ifin - 1][2] + + u[k][j][ifin - 1][3] * u[k][j][ifin - 1][3]) / + u[k][j][ifin - 1][0]); + } + } + } + frc3 = 0.0; + for(k = ki1; k < ki2 - 1; k++) + { + for(j = jbeg; j < jfin1; j++) + { + frc3 = frc3 + + (phi1[k][j] + phi1[k][j + 1] + phi1[k + 
1][j] + phi1[k + 1][j + 1] + + phi2[k][j] + phi2[k][j + 1] + phi2[k + 1][j] + phi2[k + 1][j + 1]); + } + } + frc3 = deta * dzeta * frc3; + frc = 0.25 * (frc1 + frc2 + frc3); +} + +void +read_input() +{ + /* + * --------------------------------------------------------------------- + * if input file does not exist, it uses defaults + * ipr = 1 for detailed progress output + * inorm = how often the norm is printed (once every inorm iterations) + * itmax = number of pseudo time steps + * dt = time step + * omega 1 over-relaxation factor for SSOR + * tolrsd = steady state residual tolerance levels + * nx, ny, nz = number of grid points in x, y, z directions + * --------------------------------------------------------------------- + */ + FILE* fp; + int avoid_warning; + if((fp = fopen("inputlu.data", "r")) != NULL) + { + printf("Reading from input file inputlu.data\n"); + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + avoid_warning = fscanf(fp, "%d%d", &ipr, &inorm); + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + avoid_warning = fscanf(fp, "%d", &itmax); + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + avoid_warning = fscanf(fp, "%lf", &dt); + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + avoid_warning = fscanf(fp, "%lf", &omega); + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + avoid_warning = fscanf(fp, "%lf%lf%lf%lf%lf", &tolrsd[0], &tolrsd[1], &tolrsd[2], + &tolrsd[3], &tolrsd[4]); + while(fgetc(fp) != '\n') + ; + while(fgetc(fp) != '\n') + ; + avoid_warning = fscanf(fp, "%d%d%d", &nx0, &ny0, &nz0); + fclose(fp); + } + else + { + ipr = IPR_DEFAULT; + inorm = INORM_DEFAULT; + itmax = ITMAX_DEFAULT; + dt = DT_DEFAULT; + omega = OMEGA_DEFAULT; + tolrsd[0] = TOLRSD1_DEF; + tolrsd[1] = TOLRSD2_DEF; + tolrsd[2] = TOLRSD3_DEF; + tolrsd[3] = TOLRSD4_DEF; + tolrsd[4] = 
TOLRSD5_DEF; + nx0 = ISIZ1; + ny0 = ISIZ2; + nz0 = ISIZ3; + } + /* + * --------------------------------------------------------------------- + * check problem size + * --------------------------------------------------------------------- + */ + if((nx0 < 4) || (ny0 < 4) || (nz0 < 4)) + { + printf(" PROBLEM SIZE IS TOO SMALL - \n" + " SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5\n"); + exit(EXIT_FAILURE); + } + if((nx0 > ISIZ1) || (ny0 > ISIZ2) || (nz0 > ISIZ3)) + { + printf(" PROBLEM SIZE IS TOO LARGE - \n" + " NX, NY AND NZ SHOULD BE EQUAL TO \n" + " ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY\n"); + exit(EXIT_FAILURE); + } + printf("\n\n NAS Parallel Benchmarks 4.1 Parallel C++ version with OpenMP - LU " + "Benchmark\n\n"); + printf(" Size: %4dx%4dx%4d\n", nx0, ny0, nz0); + printf(" Iterations: %4d\n", itmax); + printf("\n"); +} + +/* + * --------------------------------------------------------------------- + * compute the right hand sides + * --------------------------------------------------------------------- + */ +void +rhs() +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, k, m; + double q; + double tmp, utmp[ISIZ3][6], rtmp[ISIZ3][5]; + double u21, u31, u41; + double u21i, u31i, u41i, u51i; + double u21j, u31j, u41j, u51j; + double u21k, u31k, u41k, u51k; + double u21im1, u31im1, u41im1, u51im1; + double u21jm1, u31jm1, u41jm1, u51jm1; + double u21km1, u31km1, u41km1, u51km1; + double flux[ISIZ1][5]; + + if(timeron) + { + timer_start(T_RHS); + } +#pragma omp for + for(k = 0; k < nz; k++) + { + for(j = 0; j < ny; j++) + { + for(i = 0; i < nx; i++) + { + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = -frct[k][j][i][m]; + } + tmp = 1.0 / u[k][j][i][0]; + rho_i[k][j][i] = tmp; + qs[k][j][i] = + 0.50 * + (u[k][j][i][1] * u[k][j][i][1] + u[k][j][i][2] * u[k][j][i][2] + + u[k][j][i][3] * u[k][j][i][3]) * + tmp; + } + } + } + 
if(timeron) + { + timer_start(T_RHSX); + } +/* + * --------------------------------------------------------------------- + * xi-direction flux differences + * --------------------------------------------------------------------- + */ +#pragma omp for + for(k = 1; k < nz - 1; k++) + { + for(j = jst; j < jend; j++) + { + for(i = 0; i < nx; i++) + { + flux[i][0] = u[k][j][i][1]; + u21 = u[k][j][i][1] * rho_i[k][j][i]; + q = qs[k][j][i]; + flux[i][1] = u[k][j][i][1] * u21 + C2 * (u[k][j][i][4] - q); + flux[i][2] = u[k][j][i][2] * u21; + flux[i][3] = u[k][j][i][3] * u21; + flux[i][4] = (C1 * u[k][j][i][4] - C2 * q) * u21; + } + for(i = ist; i < iend; i++) + { + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = + rsd[k][j][i][m] - tx2 * (flux[i + 1][m] - flux[i - 1][m]); + } + } + for(i = ist; i < nx; i++) + { + tmp = rho_i[k][j][i]; + u21i = tmp * u[k][j][i][1]; + u31i = tmp * u[k][j][i][2]; + u41i = tmp * u[k][j][i][3]; + u51i = tmp * u[k][j][i][4]; + tmp = rho_i[k][j][i - 1]; + u21im1 = tmp * u[k][j][i - 1][1]; + u31im1 = tmp * u[k][j][i - 1][2]; + u41im1 = tmp * u[k][j][i - 1][3]; + u51im1 = tmp * u[k][j][i - 1][4]; + flux[i][1] = (4.0 / 3.0) * tx3 * (u21i - u21im1); + flux[i][2] = tx3 * (u31i - u31im1); + flux[i][3] = tx3 * (u41i - u41im1); + flux[i][4] = 0.50 * (1.0 - C1 * C5) * tx3 * + ((u21i * u21i + u31i * u31i + u41i * u41i) - + (u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1)) + + (1.0 / 6.0) * tx3 * (u21i * u21i - u21im1 * u21im1) + + C1 * C5 * tx3 * (u51i - u51im1); + } + for(i = ist; i < iend; i++) + { + rsd[k][j][i][0] = + rsd[k][j][i][0] + + dx1 * tx1 * + (u[k][j][i - 1][0] - 2.0 * u[k][j][i][0] + u[k][j][i + 1][0]); + rsd[k][j][i][1] = + rsd[k][j][i][1] + tx3 * C3 * C4 * (flux[i + 1][1] - flux[i][1]) + + dx2 * tx1 * + (u[k][j][i - 1][1] - 2.0 * u[k][j][i][1] + u[k][j][i + 1][1]); + rsd[k][j][i][2] = + rsd[k][j][i][2] + tx3 * C3 * C4 * (flux[i + 1][2] - flux[i][2]) + + dx3 * tx1 * + (u[k][j][i - 1][2] - 2.0 * u[k][j][i][2] + u[k][j][i + 1][2]); + 
rsd[k][j][i][3] = + rsd[k][j][i][3] + tx3 * C3 * C4 * (flux[i + 1][3] - flux[i][3]) + + dx4 * tx1 * + (u[k][j][i - 1][3] - 2.0 * u[k][j][i][3] + u[k][j][i + 1][3]); + rsd[k][j][i][4] = + rsd[k][j][i][4] + tx3 * C3 * C4 * (flux[i + 1][4] - flux[i][4]) + + dx5 * tx1 * + (u[k][j][i - 1][4] - 2.0 * u[k][j][i][4] + u[k][j][i + 1][4]); + } + /* + * --------------------------------------------------------------------- + * fourth-order dissipation + * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + rsd[k][j][1][m] = + rsd[k][j][1][m] - + dssp * (+5.0 * u[k][j][1][m] - 4.0 * u[k][j][2][m] + u[k][j][3][m]); + rsd[k][j][2][m] = + rsd[k][j][2][m] - dssp * (-4.0 * u[k][j][1][m] + 6.0 * u[k][j][2][m] - + 4.0 * u[k][j][3][m] + u[k][j][4][m]); + } + for(i = 3; i < nx - 3; i++) + { + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = + rsd[k][j][i][m] - + dssp * (u[k][j][i - 2][m] - 4.0 * u[k][j][i - 1][m] + + 6.0 * u[k][j][i][m] - 4.0 * u[k][j][i + 1][m] + + u[k][j][i + 2][m]); + } + } + for(m = 0; m < 5; m++) + { + rsd[k][j][nx - 3][m] = + rsd[k][j][nx - 3][m] - + dssp * (u[k][j][nx - 5][m] - 4.0 * u[k][j][nx - 4][m] + + 6.0 * u[k][j][nx - 3][m] - 4.0 * u[k][j][nx - 2][m]); + rsd[k][j][nx - 2][m] = + rsd[k][j][nx - 2][m] - + dssp * (u[k][j][nx - 4][m] - 4.0 * u[k][j][nx - 3][m] + + 5.0 * u[k][j][nx - 2][m]); + } + } + } + if(timeron) + { + timer_stop(T_RHSX); + } + if(timeron) + { + timer_start(T_RHSY); + } +/* + * --------------------------------------------------------------------- + * eta-direction flux differences + * --------------------------------------------------------------------- + */ +#pragma omp for + for(k = 1; k < nz - 1; k++) + { + for(i = ist; i < iend; i++) + { + for(j = 0; j < ny; j++) + { + flux[j][0] = u[k][j][i][2]; + u31 = u[k][j][i][2] * rho_i[k][j][i]; + q = qs[k][j][i]; + flux[j][1] = u[k][j][i][1] * u31; + flux[j][2] = u[k][j][i][2] * u31 + C2 * (u[k][j][i][4] - q); + flux[j][3] = u[k][j][i][3] * 
u31; + flux[j][4] = (C1 * u[k][j][i][4] - C2 * q) * u31; + } + for(j = jst; j < jend; j++) + { + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = + rsd[k][j][i][m] - ty2 * (flux[j + 1][m] - flux[j - 1][m]); + } + } + for(j = jst; j < ny; j++) + { + tmp = rho_i[k][j][i]; + u21j = tmp * u[k][j][i][1]; + u31j = tmp * u[k][j][i][2]; + u41j = tmp * u[k][j][i][3]; + u51j = tmp * u[k][j][i][4]; + tmp = rho_i[k][j - 1][i]; + u21jm1 = tmp * u[k][j - 1][i][1]; + u31jm1 = tmp * u[k][j - 1][i][2]; + u41jm1 = tmp * u[k][j - 1][i][3]; + u51jm1 = tmp * u[k][j - 1][i][4]; + flux[j][1] = ty3 * (u21j - u21jm1); + flux[j][2] = (4.0 / 3.0) * ty3 * (u31j - u31jm1); + flux[j][3] = ty3 * (u41j - u41jm1); + flux[j][4] = 0.50 * (1.0 - C1 * C5) * ty3 * + ((u21j * u21j + u31j * u31j + u41j * u41j) - + (u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1)) + + (1.0 / 6.0) * ty3 * (u31j * u31j - u31jm1 * u31jm1) + + C1 * C5 * ty3 * (u51j - u51jm1); + } + for(j = jst; j < jend; j++) + { + rsd[k][j][i][0] = + rsd[k][j][i][0] + + dy1 * ty1 * + (u[k][j - 1][i][0] - 2.0 * u[k][j][i][0] + u[k][j + 1][i][0]); + rsd[k][j][i][1] = + rsd[k][j][i][1] + ty3 * C3 * C4 * (flux[j + 1][1] - flux[j][1]) + + dy2 * ty1 * + (u[k][j - 1][i][1] - 2.0 * u[k][j][i][1] + u[k][j + 1][i][1]); + rsd[k][j][i][2] = + rsd[k][j][i][2] + ty3 * C3 * C4 * (flux[j + 1][2] - flux[j][2]) + + dy3 * ty1 * + (u[k][j - 1][i][2] - 2.0 * u[k][j][i][2] + u[k][j + 1][i][2]); + rsd[k][j][i][3] = + rsd[k][j][i][3] + ty3 * C3 * C4 * (flux[j + 1][3] - flux[j][3]) + + dy4 * ty1 * + (u[k][j - 1][i][3] - 2.0 * u[k][j][i][3] + u[k][j + 1][i][3]); + rsd[k][j][i][4] = + rsd[k][j][i][4] + ty3 * C3 * C4 * (flux[j + 1][4] - flux[j][4]) + + dy5 * ty1 * + (u[k][j - 1][i][4] - 2.0 * u[k][j][i][4] + u[k][j + 1][i][4]); + } + } + /* + * --------------------------------------------------------------------- + * fourth-order dissipation + * --------------------------------------------------------------------- + */ + for(i = ist; i < iend; i++) + { + for(m = 0; 
m < 5; m++) + { + rsd[k][1][i][m] = + rsd[k][1][i][m] - + dssp * (+5.0 * u[k][1][i][m] - 4.0 * u[k][2][i][m] + u[k][3][i][m]); + rsd[k][2][i][m] = + rsd[k][2][i][m] - dssp * (-4.0 * u[k][1][i][m] + 6.0 * u[k][2][i][m] - + 4.0 * u[k][3][i][m] + u[k][4][i][m]); + } + } + for(j = 3; j < ny - 3; j++) + { + for(i = ist; i < iend; i++) + { + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = + rsd[k][j][i][m] - + dssp * (u[k][j - 2][i][m] - 4.0 * u[k][j - 1][i][m] + + 6.0 * u[k][j][i][m] - 4.0 * u[k][j + 1][i][m] + + u[k][j + 2][i][m]); + } + } + } + for(i = ist; i < iend; i++) + { + for(m = 0; m < 5; m++) + { + rsd[k][ny - 3][i][m] = + rsd[k][ny - 3][i][m] - + dssp * (u[k][ny - 5][i][m] - 4.0 * u[k][ny - 4][i][m] + + 6.0 * u[k][ny - 3][i][m] - 4.0 * u[k][ny - 2][i][m]); + rsd[k][ny - 2][i][m] = + rsd[k][ny - 2][i][m] - + dssp * (u[k][ny - 4][i][m] - 4.0 * u[k][ny - 3][i][m] + + 5.0 * u[k][ny - 2][i][m]); + } + } + } + if(timeron) + { + timer_stop(T_RHSY); + } + if(timeron) + { + timer_start(T_RHSZ); + } +/* + * --------------------------------------------------------------------- + * zeta-direction flux differences + * --------------------------------------------------------------------- + */ +#pragma omp for + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + for(k = 0; k < nz; k++) + { + utmp[k][0] = u[k][j][i][0]; + utmp[k][1] = u[k][j][i][1]; + utmp[k][2] = u[k][j][i][2]; + utmp[k][3] = u[k][j][i][3]; + utmp[k][4] = u[k][j][i][4]; + utmp[k][5] = rho_i[k][j][i]; + } + for(k = 0; k < nz; k++) + { + flux[k][0] = utmp[k][3]; + u41 = utmp[k][3] * utmp[k][5]; + q = qs[k][j][i]; + flux[k][1] = utmp[k][1] * u41; + flux[k][2] = utmp[k][2] * u41; + flux[k][3] = utmp[k][3] * u41 + C2 * (utmp[k][4] - q); + flux[k][4] = (C1 * utmp[k][4] - C2 * q) * u41; + } + for(k = 1; k < nz - 1; k++) + { + for(m = 0; m < 5; m++) + { + rtmp[k][m] = + rsd[k][j][i][m] - tz2 * (flux[k + 1][m] - flux[k - 1][m]); + } + } + for(k = 1; k < nz; k++) + { + tmp = utmp[k][5]; + u21k = tmp * 
utmp[k][1]; + u31k = tmp * utmp[k][2]; + u41k = tmp * utmp[k][3]; + u51k = tmp * utmp[k][4]; + tmp = utmp[k - 1][5]; + u21km1 = tmp * utmp[k - 1][1]; + u31km1 = tmp * utmp[k - 1][2]; + u41km1 = tmp * utmp[k - 1][3]; + u51km1 = tmp * utmp[k - 1][4]; + flux[k][1] = tz3 * (u21k - u21km1); + flux[k][2] = tz3 * (u31k - u31km1); + flux[k][3] = (4.0 / 3.0) * tz3 * (u41k - u41km1); + flux[k][4] = 0.50 * (1.0 - C1 * C5) * tz3 * + ((u21k * u21k + u31k * u31k + u41k * u41k) - + (u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1)) + + (1.0 / 6.0) * tz3 * (u41k * u41k - u41km1 * u41km1) + + C1 * C5 * tz3 * (u51k - u51km1); + } + for(k = 1; k < nz - 1; k++) + { + rtmp[k][0] = + rtmp[k][0] + + dz1 * tz1 * (utmp[k - 1][0] - 2.0 * utmp[k][0] + utmp[k + 1][0]); + rtmp[k][1] = + rtmp[k][1] + tz3 * C3 * C4 * (flux[k + 1][1] - flux[k][1]) + + dz2 * tz1 * (utmp[k - 1][1] - 2.0 * utmp[k][1] + utmp[k + 1][1]); + rtmp[k][2] = + rtmp[k][2] + tz3 * C3 * C4 * (flux[k + 1][2] - flux[k][2]) + + dz3 * tz1 * (utmp[k - 1][2] - 2.0 * utmp[k][2] + utmp[k + 1][2]); + rtmp[k][3] = + rtmp[k][3] + tz3 * C3 * C4 * (flux[k + 1][3] - flux[k][3]) + + dz4 * tz1 * (utmp[k - 1][3] - 2.0 * utmp[k][3] + utmp[k + 1][3]); + rtmp[k][4] = + rtmp[k][4] + tz3 * C3 * C4 * (flux[k + 1][4] - flux[k][4]) + + dz5 * tz1 * (utmp[k - 1][4] - 2.0 * utmp[k][4] + utmp[k + 1][4]); + } + /* + * --------------------------------------------------------------------- + * fourth-order dissipation + * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + rsd[1][j][i][m] = rtmp[1][m] - dssp * (+5.0 * utmp[1][m] - + 4.0 * utmp[2][m] + utmp[3][m]); + rsd[2][j][i][m] = + rtmp[2][m] - dssp * (-4.0 * utmp[1][m] + 6.0 * utmp[2][m] - + 4.0 * utmp[3][m] + utmp[4][m]); + } + for(k = 3; k < nz - 3; k++) + { + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = + rtmp[k][m] - + dssp * (utmp[k - 2][m] - 4.0 * utmp[k - 1][m] + 6.0 * utmp[k][m] - + 4.0 * utmp[k + 1][m] + utmp[k + 2][m]); + } + } + for(m 
= 0; m < 5; m++) + { + rsd[nz - 3][j][i][m] = + rtmp[nz - 3][m] - + dssp * (utmp[nz - 5][m] - 4.0 * utmp[nz - 4][m] + + 6.0 * utmp[nz - 3][m] - 4.0 * utmp[nz - 2][m]); + rsd[nz - 2][j][i][m] = + rtmp[nz - 2][m] - dssp * (utmp[nz - 4][m] - 4.0 * utmp[nz - 3][m] + + 5.0 * utmp[nz - 2][m]); + } + } + } + if(timeron) + { + timer_stop(T_RHSZ); + } + if(timeron) + { + timer_stop(T_RHS); + } +} + +/* + * --------------------------------------------------------------------- + * set the boundary values of dependent variables + * --------------------------------------------------------------------- + */ +void +setbv() +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, k, m; + double temp1[5], temp2[5]; +/* + * --------------------------------------------------------------------- + * set the dependent variable values along the top and bottom faces + * --------------------------------------------------------------------- + */ +#pragma omp for + for(j = 0; j < ny; j++) + { + for(i = 0; i < nx; i++) + { + exact(i, j, 0, temp1); + exact(i, j, nz - 1, temp2); + for(m = 0; m < 5; m++) + { + u[0][j][i][m] = temp1[m]; + u[nz - 1][j][i][m] = temp2[m]; + } + } + } +/* + * --------------------------------------------------------------------- + * set the dependent variable values along north and south faces + * --------------------------------------------------------------------- + */ +#pragma omp for + for(k = 0; k < nz; k++) + { + for(i = 0; i < nx; i++) + { + exact(i, 0, k, temp1); + exact(i, ny - 1, k, temp2); + for(m = 0; m < 5; m++) + { + u[k][0][i][m] = temp1[m]; + u[k][ny - 1][i][m] = temp2[m]; + } + } + } +/* + * --------------------------------------------------------------------- + * set the dependent variable values along east and west faces + * --------------------------------------------------------------------- + */ +#pragma omp for 
+ for(k = 0; k < nz; k++) + { + for(j = 0; j < ny; j++) + { + exact(0, j, k, temp1); + exact(nx - 1, j, k, temp2); + for(m = 0; m < 5; m++) + { + u[k][j][0][m] = temp1[m]; + u[k][j][nx - 1][m] = temp2[m]; + } + } + } +} + +void +setcoeff() +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + * set up coefficients + * --------------------------------------------------------------------- + */ + dxi = 1.0 / (nx0 - 1); + deta = 1.0 / (ny0 - 1); + dzeta = 1.0 / (nz0 - 1); + tx1 = 1.0 / (dxi * dxi); + tx2 = 1.0 / (2.0 * dxi); + tx3 = 1.0 / dxi; + ty1 = 1.0 / (deta * deta); + ty2 = 1.0 / (2.0 * deta); + ty3 = 1.0 / deta; + tz1 = 1.0 / (dzeta * dzeta); + tz2 = 1.0 / (2.0 * dzeta); + tz3 = 1.0 / dzeta; + /* + * --------------------------------------------------------------------- + * diffusion coefficients + * --------------------------------------------------------------------- + */ + dx1 = 0.75; + dx2 = dx1; + dx3 = dx1; + dx4 = dx1; + dx5 = dx1; + dy1 = 0.75; + dy2 = dy1; + dy3 = dy1; + dy4 = dy1; + dy5 = dy1; + dz1 = 1.00; + dz2 = dz1; + dz3 = dz1; + dz4 = dz1; + dz5 = dz1; + /* + * --------------------------------------------------------------------- + * fourth difference dissipation + * --------------------------------------------------------------------- + */ + dssp = (max(max(dx1, dy1), dz1)) / 4.0; + /* + * --------------------------------------------------------------------- + * coefficients of the exact solution to the first pde + * --------------------------------------------------------------------- + */ + ce[0][0] = 2.0; + ce[1][0] = 0.0; + ce[2][0] = 0.0; + ce[3][0] = 4.0; + ce[4][0] = 5.0; + ce[5][0] = 3.0; + ce[6][0] = 5.0e-01; + ce[7][0] = 2.0e-02; + ce[8][0] = 1.0e-02; + ce[9][0] = 3.0e-02; + ce[10][0] = 5.0e-01; + ce[11][0] = 4.0e-01; + ce[12][0] = 3.0e-01; + /* + * 
--------------------------------------------------------------------- + * coefficients of the exact solution to the second pde + * --------------------------------------------------------------------- + */ + ce[0][1] = 1.0; + ce[1][1] = 0.0; + ce[2][1] = 0.0; + ce[3][1] = 0.0; + ce[4][1] = 1.0; + ce[5][1] = 2.0; + ce[6][1] = 3.0; + ce[7][1] = 1.0e-02; + ce[8][1] = 3.0e-02; + ce[9][1] = 2.0e-02; + ce[10][1] = 4.0e-01; + ce[11][1] = 3.0e-01; + ce[12][1] = 5.0e-01; + /* + * --------------------------------------------------------------------- + * coefficients of the exact solution to the third pde + * --------------------------------------------------------------------- + */ + ce[0][2] = 2.0; + ce[1][2] = 2.0; + ce[2][2] = 0.0; + ce[3][2] = 0.0; + ce[4][2] = 0.0; + ce[5][2] = 2.0; + ce[6][2] = 3.0; + ce[7][2] = 4.0e-02; + ce[8][2] = 3.0e-02; + ce[9][2] = 5.0e-02; + ce[10][2] = 3.0e-01; + ce[11][2] = 5.0e-01; + ce[12][2] = 4.0e-01; + /* + * --------------------------------------------------------------------- + * coefficients of the exact solution to the fourth pde + * --------------------------------------------------------------------- + */ + ce[0][3] = 2.0; + ce[1][3] = 2.0; + ce[2][3] = 0.0; + ce[3][3] = 0.0; + ce[4][3] = 0.0; + ce[5][3] = 2.0; + ce[6][3] = 3.0; + ce[7][3] = 3.0e-02; + ce[8][3] = 5.0e-02; + ce[9][3] = 4.0e-02; + ce[10][3] = 2.0e-01; + ce[11][3] = 1.0e-01; + ce[12][3] = 3.0e-01; + /* + * --------------------------------------------------------------------- + * coefficients of the exact solution to the fifth pde + * --------------------------------------------------------------------- + */ + ce[0][4] = 5.0; + ce[1][4] = 4.0; + ce[2][4] = 3.0; + ce[3][4] = 2.0; + ce[4][4] = 1.0e-01; + ce[5][4] = 4.0e-01; + ce[6][4] = 3.0e-01; + ce[7][4] = 5.0e-02; + ce[8][4] = 4.0e-02; + ce[9][4] = 3.0e-02; + ce[10][4] = 1.0e-01; + ce[11][4] = 3.0e-01; + ce[12][4] = 2.0e-01; +} + +/* + * --------------------------------------------------------------------- + * set the 
initial values of independent variables based on tri-linear + * interpolation of boundary values in the computational space. + * --------------------------------------------------------------------- + */ +void +setiv() +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, k, m; + double xi, eta, zeta; + double pxi, peta, pzeta; + double ue_1jk[5], ue_nx0jk[5], ue_i1k[5]; + double ue_iny0k[5], ue_ij1[5], ue_ijnz[5]; + +#pragma omp for + for(k = 1; k < nz - 1; k++) + { + zeta = ((double) k) / (nz - 1); + for(j = 1; j < ny - 1; j++) + { + eta = ((double) j) / (ny0 - 1); + for(i = 1; i < nx - 1; i++) + { + xi = ((double) i) / (nx0 - 1); + exact(0, j, k, ue_1jk); + exact(nx0 - 1, j, k, ue_nx0jk); + exact(i, 0, k, ue_i1k); + exact(i, ny0 - 1, k, ue_iny0k); + exact(i, j, 0, ue_ij1); + exact(i, j, nz - 1, ue_ijnz); + for(m = 0; m < 5; m++) + { + pxi = (1.0 - xi) * ue_1jk[m] + xi * ue_nx0jk[m]; + peta = (1.0 - eta) * ue_i1k[m] + eta * ue_iny0k[m]; + pzeta = (1.0 - zeta) * ue_ij1[m] + zeta * ue_ijnz[m]; + u[k][j][i][m] = pxi + peta + pzeta - pxi * peta - peta * pzeta - + pzeta * pxi + pxi * peta * pzeta; + } + } + } + } +} + +/* + * --------------------------------------------------------------------- + * to perform pseudo-time stepping SSOR iterations + * for five nonlinear pde's. 
+ * --------------------------------------------------------------------- + */ +void +ssor(int niter) +{ + /* + * --------------------------------------------------------------------- + * local variables + * --------------------------------------------------------------------- + */ + int i, j, k, m, n; + int istep; + double tmp, tv[ISIZ2 * (ISIZ1 / 2 * 2 + 1) * 5]; + double delunm[5]; + /* + * --------------------------------------------------------------------- + * begin pseudo-time stepping iterations + * --------------------------------------------------------------------- + */ + tmp = 1.0 / (omega * (2.0 - omega)); + +/* + * --------------------------------------------------------------------- + * initialize a,b,c,d to zero (guarantees that page tables have been + * formed, if applicable on given architecture, before timestepping). + * --------------------------------------------------------------------- + */ +#pragma omp parallel for private(i, j, n, m) + for(j = 0; j < ISIZ2; j++) + { + for(i = 0; i < ISIZ1; i++) + { + for(n = 0; n < 5; n++) + { + for(m = 0; m < 5; m++) + { + a[j][i][n][m] = 0.0; + b[j][i][n][m] = 0.0; + c[j][i][n][m] = 0.0; + d[j][i][n][m] = 0.0; + } + } + } + } + for(i = 1; i <= T_LAST; i++) + { + timer_clear(i); + } + +#pragma omp parallel + { + /* + * --------------------------------------------------------------------- + * compute the steady-state residuals + * --------------------------------------------------------------------- + */ + rhs(); + + /* + * --------------------------------------------------------------------- + * compute the L2 norms of newton iteration residuals + * --------------------------------------------------------------------- + */ + l2norm(nx0, ny0, nz0, ist, iend, jst, jend, rsd, rsdnm); + } /* end parallel */ + + for(i = 1; i <= T_LAST; i++) + { + timer_clear(i); + } + timer_start(1); + +#pragma omp parallel private(istep, i, j, k, m) + { + /* + * 
--------------------------------------------------------------------- + * the timestep loop + * --------------------------------------------------------------------- + */ + for(istep = 1; istep <= niter; istep++) + { + if((istep % 20) == 0 || istep == itmax || istep == 1) + { +#pragma omp master + if(niter > 1) + { + printf(" Time step %4d\n", istep); + } + } + /* + * --------------------------------------------------------------------- + * perform SSOR iteration + * --------------------------------------------------------------------- + */ + if(timeron) + { +#pragma omp master + timer_start(T_RHS); + } +#pragma omp for + for(k = 1; k < nz - 1; k++) + { + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + for(m = 0; m < 5; m++) + { + rsd[k][j][i][m] = dt * rsd[k][j][i][m]; + } + } + } + } + if(timeron) + { +#pragma omp master + timer_stop(T_RHS); + } + + for(k = 1; k < nz - 1; k++) + { + /* + * --------------------------------------------------------------------- + * form the lower triangular part of the jacobian matrix + * --------------------------------------------------------------------- + */ + if(timeron) + { +#pragma omp master + timer_start(T_JACLD); + } + jacld(k); + if(timeron) + { +#pragma omp master + timer_stop(T_JACLD); + } + + /* + * --------------------------------------------------------------------- + * perform the lower triangular solution + * --------------------------------------------------------------------- + */ + if(timeron) + { +#pragma omp master + timer_start(T_BLTS); + } + + blts(nx, ny, nz, k, omega, rsd, a, b, c, d, ist, iend, jst, jend, nx0, + ny0); + + if(timeron) + { +#pragma omp master + timer_stop(T_BLTS); + } + } + +#pragma omp barrier + + for(k = nz - 2; k > 0; k--) + { + /* + * --------------------------------------------------------------------- + * form the strictly upper triangular part of the jacobian matrix + * --------------------------------------------------------------------- + */ + if(timeron) + { 
+#pragma omp master + timer_start(T_JACU); + } + jacu(k); + if(timeron) + { +#pragma omp master + timer_stop(T_JACU); + } + /* + * --------------------------------------------------------------------- + * perform the upper triangular solution + * --------------------------------------------------------------------- + */ + if(timeron) + { +#pragma omp master + timer_start(T_BUTS); + } + + buts(nx, ny, nz, k, omega, rsd, tv, d, a, b, c, ist, iend, jst, jend, nx0, + ny0); + + if(timeron) + { +#pragma omp master + timer_stop(T_BUTS); + } + } + +#pragma omp barrier + + /* + * --------------------------------------------------------------------- + * update the variables + * --------------------------------------------------------------------- + */ + if(timeron) + { +#pragma omp master + timer_start(T_ADD); + } + +#pragma omp for + for(k = 1; k < nz - 1; k++) + { + for(j = jst; j < jend; j++) + { + for(i = ist; i < iend; i++) + { + for(m = 0; m < 5; m++) + { + u[k][j][i][m] = u[k][j][i][m] + tmp * rsd[k][j][i][m]; + } + } + } + } + if(timeron) + { +#pragma omp master + timer_stop(T_ADD); + } + /* + * --------------------------------------------------------------------- + * compute the max-norms of newton iteration corrections + * --------------------------------------------------------------------- + */ + + if((istep % inorm) == 0) + { + if(timeron) + { +#pragma omp master + timer_start(T_L2NORM); + } + l2norm(nx0, ny0, nz0, ist, iend, jst, jend, rsd, delunm); + if(timeron) + { +#pragma omp master + timer_stop(T_L2NORM); + } + } + /* + * --------------------------------------------------------------------- + * compute the steady-state residuals + * --------------------------------------------------------------------- + */ + rhs(); + + /* + * --------------------------------------------------------------------- + * compute the max-norms of newton iteration residuals + * --------------------------------------------------------------------- + */ + if(((istep % inorm) == 0) 
|| (istep == itmax)) + { + if(timeron) + { +#pragma omp master + timer_start(T_L2NORM); + } + l2norm(nx0, ny0, nz0, ist, iend, jst, jend, rsd, rsdnm); + if(timeron) + { +#pragma omp master + timer_stop(T_L2NORM); + } + } + /* + * --------------------------------------------------------------------- + * check the newton-iteration residuals against the tolerance levels + * --------------------------------------------------------------------- + */ + if((rsdnm[0] < tolrsd[0]) && (rsdnm[1] < tolrsd[1]) && + (rsdnm[2] < tolrsd[2]) && (rsdnm[3] < tolrsd[3]) && (rsdnm[4] < tolrsd[4])) + { +#pragma omp master + printf(" \n convergence was achieved after %4d pseudo-time steps\n", + istep); + break; + } + } + } /* end parallel */ + + timer_stop(1); + maxtime = timer_read(1); +} + +/* + * --------------------------------------------------------------------- + * verification routine + * --------------------------------------------------------------------- + */ +void +verify(double xcr[], double xce[], double xci, char* class_npb, boolean* verified) +{ + double xcrref[5], xceref[5], xciref; + double xcrdif[5], xcedif[5], xcidif; + double epsilon, dtref = 0.0; + int m; + /* + * --------------------------------------------------------------------- + * tolerance level + * --------------------------------------------------------------------- + */ + epsilon = 1.0e-08; + *class_npb = 'U'; + *verified = TRUE; + for(m = 0; m < 5; m++) + { + xcrref[m] = 1.0; + xceref[m] = 1.0; + } + xciref = 1.0; + if((nx0 == 12) && (ny0 == 12) && (nz0 == 12) && (itmax == 50)) + { + *class_npb = 'S'; + dtref = 5.0e-1; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of residual, for the (12X12X12) grid, + * after 50 time steps, with DT = 5.0d-01 + * --------------------------------------------------------------------- + */ + xcrref[0] = 1.6196343210976702e-02; + xcrref[1] = 2.1976745164821318e-03; + xcrref[2] = 1.5179927653399185e-03; + 
xcrref[3] = 1.5029584435994323e-03; + xcrref[4] = 3.4264073155896461e-02; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of solution error, for the (12X12X12) grid, + * after 50 time steps, with DT = 5.0d-01 + * --------------------------------------------------------------------- + */ + xceref[0] = 6.4223319957960924e-04; + xceref[1] = 8.4144342047347926e-05; + xceref[2] = 5.8588269616485186e-05; + xceref[3] = 5.8474222595157350e-05; + xceref[4] = 1.3103347914111294e-03; + /* + * --------------------------------------------------------------------- + * reference value of surface integral, for the (12X12X12) grid, + * after 50 time steps, with DT = 5.0d-01 + * --------------------------------------------------------------------- + */ + xciref = 7.8418928865937083e+00; + } + else if((nx0 == 33) && (ny0 == 33) && (nz0 == 33) && (itmax == 300)) + { + *class_npb = 'W'; /* SPEC95fp size */ + dtref = 1.5e-3; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of residual, for the (33x33x33) grid, + * after 300 time steps, with DT = 1.5d-3 + * --------------------------------------------------------------------- + */ + xcrref[0] = 0.1236511638192e+02; + xcrref[1] = 0.1317228477799e+01; + xcrref[2] = 0.2550120713095e+01; + xcrref[3] = 0.2326187750252e+01; + xcrref[4] = 0.2826799444189e+02; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of solution error, for the (33X33X33) grid, + * --------------------------------------------------------------------- + */ + xceref[0] = 0.4867877144216e+00; + xceref[1] = 0.5064652880982e-01; + xceref[2] = 0.9281818101960e-01; + xceref[3] = 0.8570126542733e-01; + xceref[4] = 0.1084277417792e+01; + /* + * --------------------------------------------------------------------- + * rReference value of surface integral, for the (33X33X33) grid, + * after 
300 time steps, with DT = 1.5d-3 + * --------------------------------------------------------------------- + */ + xciref = 0.1161399311023e+02; + } + else if((nx0 == 64) && (ny0 == 64) && (nz0 == 64) && (itmax == 250)) + { + *class_npb = 'A'; + dtref = 2.0e+0; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of residual, for the (64X64X64) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xcrref[0] = 7.7902107606689367e+02; + xcrref[1] = 6.3402765259692870e+01; + xcrref[2] = 1.9499249727292479e+02; + xcrref[3] = 1.7845301160418537e+02; + xcrref[4] = 1.8384760349464247e+03; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of solution error, for the (64X64X64) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xceref[0] = 2.9964085685471943e+01; + xceref[1] = 2.8194576365003349e+00; + xceref[2] = 7.3473412698774742e+00; + xceref[3] = 6.7139225687777051e+00; + xceref[4] = 7.0715315688392578e+01; + /* + * --------------------------------------------------------------------- + * reference value of surface integral, for the (64X64X64) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xciref = 2.6030925604886277e+01; + } + else if((nx0 == 102) && (ny0 == 102) && (nz0 == 102) && (itmax == 250)) + { + *class_npb = 'B'; + dtref = 2.0e+0; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of residual, for the (102X102X102) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xcrref[0] = 3.5532672969982736e+03; + xcrref[1] = 2.6214750795310692e+02; + xcrref[2] = 
8.8333721850952190e+02; + xcrref[3] = 7.7812774739425265e+02; + xcrref[4] = 7.3087969592545314e+03; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of solution error, for the (102X102X102) + * grid, after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xceref[0] = 1.1401176380212709e+02; + xceref[1] = 8.1098963655421574e+00; + xceref[2] = 2.8480597317698308e+01; + xceref[3] = 2.5905394567832939e+01; + xceref[4] = 2.6054907504857413e+02; + /* + c--------------------------------------------------------------------- + * reference value of surface integral, for the (102X102X102) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xciref = 4.7887162703308227e+01; + } + else if((nx0 == 162) && (ny0 == 162) && (nz0 == 162) && (itmax == 250)) + { + *class_npb = 'C'; + dtref = 2.0e+0; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of residual, for the (162X162X162) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xcrref[0] = 1.03766980323537846e+04; + xcrref[1] = 8.92212458801008552e+02; + xcrref[2] = 2.56238814582660871e+03; + xcrref[3] = 2.19194343857831427e+03; + xcrref[4] = 1.78078057261061185e+04; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of solution error, for the (162X162X162) + * grid, after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xceref[0] = 2.15986399716949279e+02; + xceref[1] = 1.55789559239863600e+01; + xceref[2] = 5.41318863077207766e+01; + xceref[3] = 4.82262643154045421e+01; + xceref[4] = 4.55902910043250358e+02; + /* + * 
--------------------------------------------------------------------- + * reference value of surface integral, for the (162X162X162) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xciref = 6.66404553572181300e+01; + /* + * --------------------------------------------------------------------- + * reference value of surface integral, for the (162X162X162) grid, + * after 250 time steps, with DT = 2.0d+00 + * --------------------------------------------------------------------- + */ + xciref = 6.66404553572181300e+01; + } + else if((nx0 == 408) && (ny0 == 408) && (nz0 == 408) && (itmax == 300)) + { + *class_npb = 'D'; + dtref = 1.0e+0; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of residual, for the (408X408X408) grid, + * after 300 time steps, with DT = 1.0d+00 + * --------------------------------------------------------------------- + */ + xcrref[0] = 0.4868417937025e+05; + xcrref[1] = 0.4696371050071e+04; + xcrref[2] = 0.1218114549776e+05; + xcrref[3] = 0.1033801493461e+05; + xcrref[4] = 0.7142398413817e+05; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of solution error, for the (408X408X408) + * grid, after 300 time steps, with DT = 1.0d+00 + * --------------------------------------------------------------------- + */ + xceref[0] = 0.3752393004482e+03; + xceref[1] = 0.3084128893659e+02; + xceref[2] = 0.9434276905469e+02; + xceref[3] = 0.8230686681928e+02; + xceref[4] = 0.7002620636210e+03; + /* + * --------------------------------------------------------------------- + * reference value of surface integral, for the (408X408X408) grid, + * after 300 time steps, with DT = 1.0d+00 + * --------------------------------------------------------------------- + */ + xciref = 0.8334101392503e+02; + } + else if((nx0 == 1020) && (ny0 == 1020) && (nz0 == 1020) 
&& (itmax == 300)) + { + *class_npb = 'E'; + dtref = 0.5e+0; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of residual, for the (1020X1020X1020) grid, + * after 300 time steps, with DT = 0.5d+00 + * --------------------------------------------------------------------- + */ + xcrref[0] = 0.2099641687874e+06; + xcrref[1] = 0.2130403143165e+05; + xcrref[2] = 0.5319228789371e+05; + xcrref[3] = 0.4509761639833e+05; + xcrref[4] = 0.2932360006590e+06; + /* + * --------------------------------------------------------------------- + * reference values of RMS-norms of solution error, for the (1020X1020X1020) + * grid, after 300 time steps, with DT = 0.5d+00 + * --------------------------------------------------------------------- + */ + xceref[0] = 0.4800572578333e+03; + xceref[1] = 0.4221993400184e+02; + xceref[2] = 0.1210851906824e+03; + xceref[3] = 0.1047888986770e+03; + xceref[4] = 0.8363028257389e+03; + /* + * --------------------------------------------------------------------- + * reference value of surface integral, for the (1020X1020X1020) grid, + * after 300 time steps, with DT = 0.5d+00 + * --------------------------------------------------------------------- + */ + xciref = 0.9512163272273e+02; + } + else + { + *verified = FALSE; + } + /* + * --------------------------------------------------------------------- + * verification test for residuals if gridsize is one of + * the defined grid sizes above (class .ne. 'U') + * --------------------------------------------------------------------- + * compute the difference of solution values and the known reference values. 
+ * --------------------------------------------------------------------- + */ + for(m = 0; m < 5; m++) + { + xcrdif[m] = fabs((xcr[m] - xcrref[m]) / xcrref[m]); + xcedif[m] = fabs((xce[m] - xceref[m]) / xceref[m]); + } + xcidif = fabs((xci - xciref) / xciref); + /* + * --------------------------------------------------------------------- + * output the comparison of computed results to known cases. + * --------------------------------------------------------------------- + */ + if(*class_npb != 'U') + { + printf("\n Verification being performed for class_npb %c\n", *class_npb); + printf(" Accuracy setting for epsilon = %20.13E\n", epsilon); + *verified = (fabs(dt - dtref) <= epsilon); + if(!(*verified)) + { + *class_npb = 'U'; + printf(" DT does not match the reference value of %15.8E\n", dtref); + } + } + else + { + printf(" Unknown class_npb\n"); + } + if(*class_npb != 'U') + { + printf(" Comparison of RMS-norms of residual\n"); + } + else + { + printf(" RMS-norms of residual\n"); + } + for(m = 0; m < 5; m++) + { + if(*class_npb == 'U') + { + printf(" %2d %20.13E\n", m + 1, xcr[m]); + } + else if(xcrdif[m] <= epsilon) + { + printf(" %2d %20.13E%20.13E%20.13E\n", m + 1, xcr[m], xcrref[m], + xcrdif[m]); + } + else + { + *verified = FALSE; + printf(" FAILURE: %2d %20.13E%20.13E%20.13E\n", m + 1, xcr[m], xcrref[m], + xcrdif[m]); + } + } + if(*class_npb != 'U') + { + printf(" Comparison of RMS-norms of solution error\n"); + } + else + { + printf(" RMS-norms of solution error\n"); + } + for(m = 0; m < 5; m++) + { + if(*class_npb == 'U') + { + printf(" %2d %20.13E\n", m + 1, xce[m]); + } + else if(xcedif[m] <= epsilon) + { + printf(" %2d %20.13E%20.13E%20.13E\n", m + 1, xce[m], xceref[m], + xcedif[m]); + } + else + { + *verified = FALSE; + printf(" FAILURE: %2d %20.13E%20.13E%20.13E\n", m + 1, xce[m], xceref[m], + xcedif[m]); + } + } + if(*class_npb != 'U') + { + printf(" Comparison of surface integral\n"); + } + else + { + printf(" Surface integral\n"); + } + 
if(*class_npb == 'U') + { + printf(" %20.13E\n", xci); + } + else if(xcidif <= epsilon) + { + printf(" %20.13E%20.13E%20.13E\n", xci, xciref, xcidif); + } + else + { + *verified = FALSE; + printf(" FAILURE: %20.13E%20.13E%20.13E\n", xci, xciref, xcidif); + } + if(*class_npb == 'U') + { + printf(" No reference values provided\n"); + printf("No verification performed\n"); + } + else if(*verified) + { + printf(" Verification Successful\n"); + } + else + { + printf(" Verification failed\n"); + } +} diff --git a/examples/openmp/LU/npbparams.hpp b/examples/openmp/LU/npbparams.hpp new file mode 100644 index 0000000000..dbb8c25eac --- /dev/null +++ b/examples/openmp/LU/npbparams.hpp @@ -0,0 +1,27 @@ +/* CLASS = W */ +/* + c This file is generated automatically by the setparams utility. + c It sets the number of processors and the class_npb of the NPB + c in this directory. Do not modify it by hand. + */ + +/* full problem size */ +#define ISIZ1 25 +#define ISIZ2 25 +#define ISIZ3 25 +/* number of iterations and how often to print the norm */ +#define ITMAX_DEFAULT 150 +#define INORM_DEFAULT 150 +#define DT_DEFAULT 1.5e-3 +#define CONVERTDOUBLE FALSE +#define COMPILETIME "07 Mar 2022" +#define NPBVERSION "4.1" +#define LIBVERSION "201511" +#define COMPILERVERSION "11.1.0" +#define CS1 "g++ -std=c++14" +#define CS2 "$(CC)" +#define CS3 "-lm" +#define CS4 "-I../common " +#define CS5 "-O3 -fopenmp -mcmodel=medium" +#define CS6 "-O3 -fopenmp -mcmodel=medium" +#define CS7 "randdp" diff --git a/examples/openmp/common/c_print_results.cpp b/examples/openmp/common/c_print_results.cpp new file mode 100644 index 0000000000..2dbfb2e6e5 --- /dev/null +++ b/examples/openmp/common/c_print_results.cpp @@ -0,0 +1,159 @@ +/* +MIT License + +Copyright (c) 2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. 
Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff +*/ + +#include +#include +#include + +/*****************************************************************/ +/****** C _ P R I N T _ R E S U L T S ******/ +/*****************************************************************/ +void +c_print_results(char* name, char class_npb, int n1, int n2, int n3, int niter, double t, + double mops, char* optype, int passed_verification, char* npbversion, + char* compiletime, char* compilerversion, char* libversion, + char* totalthreads, char* cc, char* clink, char* c_lib, char* c_inc, + char* cflags, char* clinkflags, char* rand) +{ + printf("\n\n %s Benchmark Completed\n", name); + printf(" class_npb = %c\n", class_npb); + if((name[0] == 'I') && (name[1] == 'S')) + { + if(n3 == 0) + { + long nn = n1; + if(n2 != 0) + { + nn *= n2; + } + printf(" Size = %12ld\n", nn); /* as in IS */ + } + else + { + printf(" Size = %4dx%4dx%4d\n", n1, n2, n3); + } + } + else + { + char size[16]; + int j; + if((n2 == 0) && (n3 == 0)) + { + if((name[0] == 'E') && (name[1] == 'P')) + { + sprintf(size, "%15.0lf", pow(2.0, n1)); + j = 14; + if(size[j] == '.') + { + size[j] = ' '; + j--; + } + size[j + 1] = '\0'; + printf(" Size = %15s\n", size); + } + else + { + printf(" Size = %12d\n", n1); + } + } + else + { + printf(" Size = %4dx%4dx%4d\n", n1, n2, n3); + } + } + printf(" Total threads = %12s\n", totalthreads); + printf(" Iterations = %12d\n", niter); + printf(" Time in seconds = %12.2f\n", t); + printf(" Mop/s total = %12.2f\n", mops); + printf(" Operation type = 
%24s\n", optype); + if(passed_verification < 0) + { + printf(" Verification = NOT PERFORMED\n"); + } + else if(passed_verification) + { + printf(" Verification = SUCCESSFUL\n"); + } + else + { + printf(" Verification = UNSUCCESSFUL\n"); + } + printf(" Version = %12s\n", npbversion); + printf(" Compile date = %12s\n", compiletime); + printf(" Compiler ver = %12s\n", compilerversion); + printf(" OpenMP version = %12s\n", libversion); + printf("\n Compile options:\n"); + printf(" CC = %s\n", cc); + printf(" CLINK = %s\n", clink); + printf(" C_LIB = %s\n", c_lib); + printf(" C_INC = %s\n", c_inc); + printf(" CFLAGS = %s\n", cflags); + printf(" CLINKFLAGS = %s\n", clinkflags); + printf(" RAND = %s\n", rand); +#ifdef SMP + evalue = getenv("MP_SET_NUMTHREADS"); + printf(" MULTICPUS = %s\n", evalue); +#endif + /* + * printf(" Please send the results of this run to:\n\n"); + * printf(" NPB Development Team\n"); + * printf(" Internet: npb@nas.nasa.gov\n \n"); + * printf(" If email is not available, send this to:\n\n"); + * printf(" MS T27A-1\n"); + * printf(" NASA Ames Research Center\n"); + * printf(" Moffett Field, CA 94035-1000\n\n"); + * printf(" Fax: 650-604-3957\n\n"); + */ + printf("\n\n"); + + printf("----------------------------------------------------------------------\n"); + printf(" NPB-CPP is developed by: \n"); + printf(" Dalvan Griebler\n"); + printf(" Gabriell Araujo (Sequential Porting)\n"); + printf(" Júnior Löff (Parallel Implementation)\n"); + printf("\n"); + printf(" In case of questions or problems, please send an e-mail to us:\n"); + printf(" dalvan.griebler; gabriell.araujo; junior.loff@edu.pucrs.br\n"); + printf("----------------------------------------------------------------------\n"); + printf("\n"); +} diff --git a/examples/openmp/common/c_randdp.cpp b/examples/openmp/common/c_randdp.cpp new file mode 100644 index 0000000000..74ba992d36 --- /dev/null +++ b/examples/openmp/common/c_randdp.cpp @@ -0,0 +1,184 @@ +/* +MIT License + +Copyright (c) 
2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff +*/ + +#if defined(USE_POW) +# define r23 pow(0.5, 23.0) +# define r46 (r23 * r23) +# define t23 pow(2.0, 23.0) +# define t46 (t23 * t23) +#else +# define r23 \ + (0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * \ + 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5 * 0.5) +# define r46 (r23 * r23) +# define t23 \ + (2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * \ + 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0 * 2.0) +# define t46 (t23 * t23) +#endif + +/* + * --------------------------------------------------------------------- + * + * this routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. this scheme generates 2^44 numbers + * before repeating. the argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). the returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * this routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On 64 bit + * systems, double precision should be disabled. 
+ * + * David H. Bailey, October 26, 1990 + * + * --------------------------------------------------------------------- + */ +double +randlc(double* x, double a) +{ + double t1, t2, t3, t4, a1, a2, x1, x2, z; + + /* + * --------------------------------------------------------------------- + * break A into two parts such that A = 2^23 * A1 + A2. + * --------------------------------------------------------------------- + */ + t1 = r23 * a; + a1 = (int) t1; + a2 = a - t23 * a1; + + /* + * --------------------------------------------------------------------- + * break X into two parts such that X = 2^23 * X1 + X2, compute + * Z = A1 * X2 + A2 * X1 (mod 2^23), and then + * X = 2^23 * Z + A2 * X2 (mod 2^46). + * --------------------------------------------------------------------- + */ + t1 = r23 * (*x); + x1 = (int) t1; + x2 = (*x) - t23 * x1; + t1 = a1 * x2 + a2 * x1; + t2 = (int) (r23 * t1); + z = t1 - t23 * t2; + t3 = t23 * z + a2 * x2; + t4 = (int) (r46 * t3); + (*x) = t3 - t46 * t4; + + return (r46 * (*x)); +} + +/* + * --------------------------------------------------------------------- + * + * this routine generates N uniform pseudorandom double precision numbers in + * the range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. this scheme generates 2^44 numbers + * before repeating. the argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). the N results are placed in Y and are normalized + * to be between 0 and 1. X is updated to contain the new seed, so that + * subsequent calls to VRANLC using the same arguments will generate a + * continuous sequence. if N is zero, only initialization is performed, and + * the variables X, A and Y are ignored. + * + * this routine is the standard version designed for scalar or RISC systems. 
+ * however, it should produce the same results on any single processor + * computer with at least 48 mantissa bits in double precision floating point + * data. on 64 bit systems, double precision should be disabled. + * + * --------------------------------------------------------------------- + */ +void +vranlc(int n, double* x_seed, double a, double y[]) +{ + int i; + double x, t1, t2, t3, t4, a1, a2, x1, x2, z; + + /* + * --------------------------------------------------------------------- + * break A into two parts such that A = 2^23 * A1 + A2. + * --------------------------------------------------------------------- + */ + t1 = r23 * a; + a1 = (int) t1; + a2 = a - t23 * a1; + x = *x_seed; + + /* + * --------------------------------------------------------------------- + * generate N results. this loop is not vectorizable. + * --------------------------------------------------------------------- + */ + for(i = 0; i < n; i++) + { + /* + * --------------------------------------------------------------------- + * break X into two parts such that X = 2^23 * X1 + X2, compute + * Z = A1 * X2 + A2 * X1 (mod 2^23), and then + * X = 2^23 * Z + A2 * X2 (mod 2^46). + * --------------------------------------------------------------------- + */ + t1 = r23 * x; + x1 = (int) t1; + x2 = x - t23 * x1; + t1 = a1 * x2 + a2 * x1; + t2 = (int) (r23 * t1); + z = t1 - t23 * t2; + t3 = t23 * z + a2 * x2; + t4 = (int) (r46 * t3); + x = t3 - t46 * t4; + y[i] = r46 * x; + } + *x_seed = x; +} \ No newline at end of file diff --git a/examples/openmp/common/c_timers.cpp b/examples/openmp/common/c_timers.cpp new file mode 100644 index 0000000000..fbcdb4bfe2 --- /dev/null +++ b/examples/openmp/common/c_timers.cpp @@ -0,0 +1,101 @@ +/* +MIT License + +Copyright (c) 2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. 
Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff +*/ + +#include "wtime.hpp" +#include + +/* prototype */ +void +wtime(double*); + +/*****************************************************************/ +/****** E L A P S E D _ T I M E ******/ +/*****************************************************************/ +double +elapsed_time(void) +{ + double t; + wtime(&t); + return (t); +} + +double start[64], elapsed[64]; + +/*****************************************************************/ +/****** T I M E R _ C L E A R ******/ +/*****************************************************************/ +void +timer_clear(int n) +{ + elapsed[n] = 0.0; +} + +/*****************************************************************/ +/****** T I M E R _ S T A R T ******/ +/*****************************************************************/ +void +timer_start(int n) +{ + start[n] = elapsed_time(); +} + +/*****************************************************************/ +/****** T I M E R _ S T O P ******/ +/*****************************************************************/ +void +timer_stop(int n) +{ + double t, now; + now = elapsed_time(); + t = now - start[n]; + elapsed[n] += t; +} + +/*****************************************************************/ +/****** T I M E R _ R E A D ******/ +/*****************************************************************/ +double +timer_read(int n) +{ + return (elapsed[n]); +} diff --git a/examples/openmp/common/npb-CPP.hpp b/examples/openmp/common/npb-CPP.hpp new file mode 100644 index 0000000000..cb50667d75 --- 
/dev/null +++ b/examples/openmp/common/npb-CPP.hpp @@ -0,0 +1,124 @@ +/* +MIT License + +Copyright (c) 2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff +*/ + +#include +#include +#include + +typedef int boolean; +typedef struct +{ + double real; + double imag; +} dcomplex; + +#define TRUE 1 +#define FALSE 0 + +#define max(a, b) (((a) > (b)) ? (a) : (b)) +#define min(a, b) (((a) < (b)) ? (a) : (b)) +#define pow2(a) ((a) * (a)) + +/* old version of the complex number operations */ +#define get_real(c) c.real +#define get_imag(c) c.imag +#define cadd(c, a, b) (c.real = a.real + b.real, c.imag = a.imag + b.imag) +#define csub(c, a, b) (c.real = a.real - b.real, c.imag = a.imag - b.imag) +#define cmul(c, a, b) \ + (c.real = a.real * b.real - a.imag * b.imag, \ + c.imag = a.real * b.imag + a.imag * b.real) +#define crmul(c, a, b) (c.real = a.real * b, c.imag = a.imag * b) + +/* latest version of the complex number operations */ +#define dcomplex_create(r, i) \ + (dcomplex) { r, i } +#define dcomplex_add(a, b) \ + (dcomplex) { (a).real + (b).real, (a).imag + (b).imag } +#define dcomplex_sub(a, b) \ + (dcomplex) { (a).real - (b).real, (a).imag - (b).imag } +#define dcomplex_mul(a, b) \ + (dcomplex) \ + { \ + ((a).real * (b).real) - ((a).imag * (b).imag), \ + ((a).real * (b).imag) + ((a).imag * (b).real) \ + } +#define dcomplex_mul2(a, b) \ + (dcomplex) { (a).real*(b), (a).imag*(b) } +static inline dcomplex +dcomplex_div(dcomplex z1, dcomplex z2) +{ + double a = z1.real; + double b = z1.imag; + double c = z2.real; + double d = z2.imag; + double divisor = c * c + d * d; + double real = (a * c + b * d) / divisor; + double imag = (b * c - 
a * d) / divisor; + dcomplex result = (dcomplex){ real, imag }; + return result; +} +#define dcomplex_div2(a, b) \ + (dcomplex) { (a).real / (b), (a).imag / (b) } +#define dcomplex_abs(x) sqrt(((x).real * (x).real) + ((x).imag * (x).imag)) +#define dconjg(x) \ + (dcomplex) { (x).real, -1.0 * (x).imag } + +extern double +randlc(double*, double); +extern void +vranlc(int, double*, double, double*); +extern void +timer_clear(int); +extern void +timer_start(int); +extern void +timer_stop(int); +extern double +timer_read(int); + +extern void +c_print_results(char* name, char class_npb, int n1, int n2, int n3, int niter, double t, + double mops, char* optype, int passed_verification, char* npbversion, + char* compiletime, char* compilerversion, char* libversion, + char* totalthreads, char* cc, char* clink, char* c_lib, char* c_inc, + char* cflags, char* clinkflags, char* rand); diff --git a/examples/openmp/common/wtime.cpp b/examples/openmp/common/wtime.cpp new file mode 100644 index 0000000000..56a9e059a6 --- /dev/null +++ b/examples/openmp/common/wtime.cpp @@ -0,0 +1,55 @@ +/* +MIT License + +Copyright (c) 2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff +*/ + +#include "wtime.hpp" +#include + +void +wtime(double* t) +{ + static int sec = -1; + struct timeval tv; + gettimeofday(&tv, 0); + if(sec < 0) sec = tv.tv_sec; + *t = (tv.tv_sec - sec) + 1.0e-6 * tv.tv_usec; +} diff --git a/examples/openmp/common/wtime.hpp b/examples/openmp/common/wtime.hpp new file mode 100644 index 0000000000..e8c39e42eb --- /dev/null +++ b/examples/openmp/common/wtime.hpp @@ -0,0 +1,54 @@ +/* +MIT License + +Copyright (c) 2021 Parallel Applications Modelling Group - GMAP + GMAP website: https://gmap.pucrs.br + + Pontifical Catholic University of Rio Grande do Sul (PUCRS) + Av. 
Ipiranga, 6681, Porto Alegre - Brazil, 90619-900 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------------------------------------------------------------------------ + +The original NPB 3.4.1 version was written in Fortran and belongs to: + http://www.nas.nasa.gov/Software/NPB/ + +------------------------------------------------------------------------------ + +The serial C++ version is a translation of the original NPB 3.4.1 +Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER + +Authors of the C++ code: + Dalvan Griebler + Gabriell Araujo + Júnior Löff +*/ + +/* + * C/Fortran interface is different on different machines. + * you may need to tweak this. 
+ */ +#if defined(IBM) +# define wtime wtime +#elif defined(CRAY) +# define wtime WTIME +#else +# define wtime wtime_ +#endif diff --git a/examples/openmp/openmp.cpp b/examples/openmp/openmp.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/transpose/transpose.cpp b/examples/transpose/transpose.cpp index 1924a8092e..44658d74bd 100644 --- a/examples/transpose/transpose.cpp +++ b/examples/transpose/transpose.cpp @@ -95,7 +95,7 @@ transpose_a(int* in, int* out, int M, int N) void run(int rank, int tid, hipStream_t stream, int argc, char** argv) { - size_t nitr = 5000; + size_t nitr = 500; unsigned int M = 4960 * 2; unsigned int N = 4960 * 2; if(argc > 2) nitr = atoll(argv[2]); diff --git a/examples/user-api/CMakeLists.txt b/examples/user-api/CMakeLists.txt new file mode 100644 index 0000000000..31118606da --- /dev/null +++ b/examples/user-api/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.15 FATAL_ERROR) + +project(omnitrace-user-api LANGUAGES CXX) + +set(CMAKE_BUILD_TYPE "Release") +find_package(Threads REQUIRED) +add_executable(user-api user-api.cpp) +target_link_libraries(user-api PRIVATE Threads::Threads omnitrace::omnitrace-user-library) + +if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + set_target_properties(user-api PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}) +endif() diff --git a/examples/user-api/user-api.cpp b/examples/user-api/user-api.cpp new file mode 100644 index 0000000000..15ecc318ad --- /dev/null +++ b/examples/user-api/user-api.cpp @@ -0,0 +1,80 @@ + +#include + +#include +#include +#include +#include +#include +#include + +std::atomic total{ 0 }; + +long +fib(long n) __attribute__((noinline)); + +void +run(size_t nitr, long) __attribute__((noinline)); + +long +fib(long n) +{ + return (n < 2) ? 
n : fib(n - 1) + fib(n - 2); +} + +#define RUN_LABEL \ + std::string{ std::string{ __FUNCTION__ } + "(" + std::to_string(n) + ") x " + \ + std::to_string(nitr) } \ + .c_str() + +void +run(size_t nitr, long n) +{ + omnitrace_user_stop_thread_trace(); + omnitrace_user_push_region(RUN_LABEL); + long local = 0; + for(size_t i = 0; i < nitr; ++i) + local += fib(n); + total += local; + omnitrace_user_pop_region(RUN_LABEL); + omnitrace_user_start_thread_trace(); +} + +int +main(int argc, char** argv) +{ + omnitrace_user_push_region(argv[0]); + omnitrace_user_push_region("initialization"); + size_t nthread = std::min(16, std::thread::hardware_concurrency()); + size_t nitr = 50000; + long nfib = 10; + if(argc > 1) nfib = atol(argv[1]); + if(argc > 2) nthread = atol(argv[2]); + if(argc > 3) nitr = atol(argv[3]); + omnitrace_user_pop_region("initialization"); + + printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", argv[0], + nthread, argv[0], nitr, argv[0], nfib); + + omnitrace_user_push_region("thread_creation"); + std::vector threads{}; + threads.reserve(nthread); + for(size_t i = 0; i < nthread; ++i) + { + size_t _nitr = ((i % 2) == 1) ? 
(nitr - (0.1 * nitr)) : (nitr + (0.1 * nitr)); + threads.emplace_back(&run, _nitr, nfib); + } + omnitrace_user_pop_region("thread_creation"); + + run(nitr - 0.25 * nitr, nfib - 0.1 * nfib); + + omnitrace_user_push_region("thread_wait"); + for(auto& itr : threads) + itr.join(); + omnitrace_user_pop_region("thread_wait"); + + printf("[%s] fibonacci(%li) x %lu = %li\n", argv[0], nfib, nthread, total.load()); + omnitrace_user_pop_region(argv[0]); + + return 0; +} diff --git a/external/PTL b/external/PTL index 61f873cf79..4afd2bdeb9 160000 --- a/external/PTL +++ b/external/PTL @@ -1 +1 @@ -Subproject commit 61f873cf79a016b0572c04f2df075a75a66389aa +Subproject commit 4afd2bdeb9ba0ff5c988a31f901f1629134cb109 diff --git a/external/timemory b/external/timemory index b8d3b3e1fd..1ea2511833 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit b8d3b3e1fd74384c927119c7bf1a9790dd25ca90 +Subproject commit 1ea25118334f9a3a541704d925631bf09dcf2e67 diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index ddf0cd5d34..0a3c81ac5d 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -12,3 +12,13 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) add_subdirectory(lib) add_subdirectory(bin) + +if(OMNITRACE_BUILD_DEVELOPER) + add_custom_target(omnitrace-precommit) + foreach(_TARGET format-omnitrace-source format-omnitrace-cmake format-timemory-source + format-timemory-cmake) + if(TARGET ${_TARGET}) + add_dependencies(omnitrace-precommit ${_TARGET}) + endif() + endforeach() +endif() diff --git a/source/bin/CMakeLists.txt b/source/bin/CMakeLists.txt index 4a531b5eb4..94b02f4721 100644 --- a/source/bin/CMakeLists.txt +++ b/source/bin/CMakeLists.txt @@ -1,3 +1,7 @@ add_subdirectory(omnitrace-avail) add_subdirectory(omnitrace-critical-trace) add_subdirectory(omnitrace) + +if(OMNITRACE_BUILD_TESTING) + add_subdirectory(tests) +endif() diff --git a/source/bin/omnitrace-avail/avail.cpp b/source/bin/omnitrace-avail/avail.cpp index 
ad53065f33..e04d047bf1 100644 --- a/source/bin/omnitrace-avail/avail.cpp +++ b/source/bin/omnitrace-avail/avail.cpp @@ -30,6 +30,7 @@ #include "library/components/omnitrace.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/components/roctracer.hpp" +#include "library/components/user_region.hpp" #include "library/config.hpp" #include @@ -613,8 +614,6 @@ main(int argc, char** argv) if(!os) os = &std::cout; - omnitrace_init_library(); - if(include_components) write_component_info(*os, options, use_mark, fields); dump_log(); @@ -766,7 +765,9 @@ write_component_info(std::ostream& os, const array_t& options, _mark.at(i)); } - _selected += (is_category_selected(std::get<2>(itr).at(CATEGORY))) ? 1 : 0; + if(!category_regex_keys.empty()) + _selected += + (is_category_selected(std::get<2>(itr).at(CATEGORY))) ? 1 : 0; if(_selected == 0) continue; } @@ -834,7 +835,8 @@ write_component_info(std::ostream& os, const array_t& options, _mark.at(i)); } - _selected += (is_category_selected(std::get<2>(itr).at(CATEGORY))) ? 1 : 0; + if(!category_regex_keys.empty()) + _selected += (is_category_selected(std::get<2>(itr).at(CATEGORY))) ? 
1 : 0; if(_selected > 0) { diff --git a/source/bin/omnitrace/CMakeLists.txt b/source/bin/omnitrace/CMakeLists.txt index 502ce19586..a54c67f823 100644 --- a/source/bin/omnitrace/CMakeLists.txt +++ b/source/bin/omnitrace/CMakeLists.txt @@ -4,10 +4,19 @@ # # ------------------------------------------------------------------------------# -add_executable( +add_executable(omnitrace-exe ${_EXCLUDE}) + +target_sources( omnitrace-exe - ${_EXCLUDE} ${CMAKE_CURRENT_LIST_DIR}/omnitrace.cpp - ${CMAKE_CURRENT_LIST_DIR}/omnitrace.hpp ${CMAKE_CURRENT_LIST_DIR}/details.cpp) + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/omnitrace.cpp + ${CMAKE_CURRENT_LIST_DIR}/details.cpp + ${CMAKE_CURRENT_LIST_DIR}/function_signature.cpp + ${CMAKE_CURRENT_LIST_DIR}/module_function.cpp + ${CMAKE_CURRENT_LIST_DIR}/omnitrace.hpp + ${CMAKE_CURRENT_LIST_DIR}/info.hpp + ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp + ${CMAKE_CURRENT_LIST_DIR}/function_signature.hpp + ${CMAKE_CURRENT_LIST_DIR}/module_function.hpp) target_link_libraries( omnitrace-exe diff --git a/source/bin/omnitrace/details.cpp b/source/bin/omnitrace/details.cpp index effb457d3a..f9c39cf6bf 100644 --- a/source/bin/omnitrace/details.cpp +++ b/source/bin/omnitrace/details.cpp @@ -40,7 +40,7 @@ get_whole_function_names() "backtrace", "backtrace_symbols", "backtrace_symbols_fd", "sigaddset", "sigandset", "sigdelset", "sigemptyset", "sigfillset", "sighold", "sigisemptyset", "sigismember", "sigorset", "sigrelse", "sigvec", "strtok", "strstr", "sbrk", - "strxfrm", + "strxfrm", "atexit", "ompt_start_tool", "nanosleep", // below are functions which never terminate "rocr::core::Signal::WaitAny", "rocr::core::Runtime::AsyncEventsLoop", "rocr::core::BusyWaitSignal::WaitAcquire", @@ -525,10 +525,32 @@ are_file_include_exclude_lists_empty() // the instrumented loop and formats it properly. 
// function_signature -get_loop_file_line_info(module_t* mutatee_module, procedure_t* f, flow_graph_t* cfGraph, +get_loop_file_line_info(module_t* module, procedure_t* func, flow_graph_t* cfGraph, basic_loop_t* loopToInstrument) { - if(!cfGraph || !loopToInstrument || !f) return function_signature{ "", "", "" }; + if(!cfGraph || !loopToInstrument || !func) return function_signature{ "", "", "" }; + + std::vector basic_blocks{}; + loopToInstrument->getLoopBasicBlocksExclusive(basic_blocks); + + if(basic_blocks.empty()) return function_signature{ "", "", "" }; + + auto base_addr = basic_blocks.front()->getStartAddress(); + auto last_addr = basic_blocks.front()->getEndAddress(); + basic_block_t* block = basic_blocks.front(); + for(const auto& itr : basic_blocks) + { + if(itr == block) continue; + if(itr->dominates(block)) + { + base_addr = itr->getStartAddress(); + last_addr = itr->getEndAddress(); + block = itr; + } + } + + verbprintf(4, "Loop: size = %lu: base_addr = %lu, last_addr = %lu\n", + (unsigned long) (last_addr - base_addr), base_addr, last_addr); char fname[FUNCNAMELEN + 1]; char mname[FUNCNAMELEN + 1]; @@ -537,32 +559,14 @@ get_loop_file_line_info(module_t* mutatee_module, procedure_t* f, flow_graph_t* memset(fname, '\0', FUNCNAMELEN + 1); memset(mname, '\0', FUNCNAMELEN + 1); - mutatee_module->getName(mname, FUNCNAMELEN); + module->getName(mname, FUNCNAMELEN); + func->getName(fname, FUNCNAMELEN); - bpvector_t* loopStartInst = - cfGraph->findLoopInstPoints(BPatch_locLoopStartIter, loopToInstrument); - bpvector_t* loopExitInst = - cfGraph->findLoopInstPoints(BPatch_locLoopEndIter, loopToInstrument); + auto* returnType = func->getReturnType(); - if(!loopStartInst || !loopExitInst) return function_signature{ "", "", "" }; + if(returnType) typeName = returnType->getName(); - unsigned long baseAddr = (unsigned long) (*loopStartInst)[0]->getAddress(); - unsigned long lastAddr = - (unsigned long) (*loopExitInst)[loopExitInst->size() - 1]->getAddress(); - 
verbprintf(3, "Loop: size of lastAddr = %lu: baseAddr = %lu, lastAddr = %lu\n", - (unsigned long) loopExitInst->size(), (unsigned long) baseAddr, - (unsigned long) lastAddr); - - f->getName(fname, FUNCNAMELEN); - - auto* returnType = f->getReturnType(); - - if(returnType) - { - typeName = returnType->getName(); - } - - auto* params = f->getParams(); + auto* params = func->getParams(); std::vector _params; if(params) { @@ -574,36 +578,51 @@ get_loop_file_line_info(module_t* mutatee_module, procedure_t* f, flow_graph_t* } } - bpvector_t lines; - bpvector_t linesEnd; + bpvector_t lines{}; + bpvector_t linesEnd{}; - bool info1 = mutatee_module->getSourceLines(baseAddr, lines); + bool info1 = module->getSourceLines(base_addr, lines); string_t filename = mname; if(info1) { // filename = lines[0].fileName(); - auto row1 = lines[0].lineNumber(); - auto col1 = lines[0].lineOffset(); + int row1 = 0; + int col1 = 0; + for(auto& itr : lines) + { + if(itr.lineNumber() > 0) + { + row1 = itr.lineNumber(); + col1 = itr.lineOffset(); + break; + } + } + + if(row1 == 0 && col1 == 0) + return function_signature(typeName, fname, filename, _params); + + int row2 = 0; + int col2 = 0; + for(auto& itr : lines) + { + row2 = std::max(row2, itr.lineNumber()); + col2 = std::max(col2, itr.lineOffset()); + } + if(col1 < 0) col1 = 0; - // This following section is attempting to remedy the limitations of - // getSourceLines for loops. As the program goes through the loop, the resulting - // lines go from the loop head, through the instructions present in the loop, to - // the last instruction in the loop, back to the loop head, then to the next - // instruction outside of the loop. What this section does is starts at the last - // instruction in the loop, then goes through the addresses until it reaches the - // next instruction outside of the loop. We then bump back a line. This is not a - // perfect solution, but we will work with the Dyninst team to find something - // better. 
- bool info2 = mutatee_module->getSourceLines((unsigned long) lastAddr, linesEnd); - verbprintf(3, "size of linesEnd = %lu\n", (unsigned long) linesEnd.size()); + bool info2 = module->getSourceLines(last_addr, linesEnd); + verbprintf(4, "size of linesEnd = %lu\n", (unsigned long) linesEnd.size()); if(info2) { - auto row2 = linesEnd[0].lineNumber(); - auto col2 = linesEnd[0].lineOffset(); + for(auto& itr : linesEnd) + { + row2 = std::max(row2, itr.lineNumber()); + col2 = std::max(col2, itr.lineOffset()); + } if(col2 < 0) col2 = 0; if(row2 < row1) row1 = row2; // Fix for wrong line numbers @@ -627,35 +646,30 @@ get_loop_file_line_info(module_t* mutatee_module, procedure_t* f, flow_graph_t* // We create a new name that embeds the file and line information in the name // function_signature -get_func_file_line_info(module_t* mutatee_module, procedure_t* f) +get_func_file_line_info(module_t* module, procedure_t* func) { - bool info1, info2; - unsigned long baseAddr, lastAddr; - char fname[FUNCNAMELEN + 1]; - char mname[FUNCNAMELEN + 1]; - int row1, col1, row2, col2; - string_t filename = {}; - string_t typeName = {}; + using address_t = Dyninst::Address; + + char fname[FUNCNAMELEN + 1]; + char mname[FUNCNAMELEN + 1]; + string_t typeName = {}; memset(fname, '\0', FUNCNAMELEN + 1); memset(mname, '\0', FUNCNAMELEN + 1); - mutatee_module->getName(mname, FUNCNAMELEN); + module->getName(mname, FUNCNAMELEN); + func->getName(fname, FUNCNAMELEN); - baseAddr = (unsigned long) (f->getBaseAddr()); - f->getAddressRange(baseAddr, lastAddr); - bpvector_t lines; - f->getName(fname, FUNCNAMELEN); + address_t base_addr{}; + address_t last_addr{}; + func->getAddressRange(base_addr, last_addr); - auto* returnType = f->getReturnType(); + auto* returnType = func->getReturnType(); - if(returnType) - { - typeName = returnType->getName(); - } + if(returnType) typeName = returnType->getName(); - auto* params = f->getParams(); - std::vector _params; + auto* params = func->getParams(); + 
std::vector _params = {}; if(params) { for(auto* itr : *params) @@ -666,32 +680,16 @@ get_func_file_line_info(module_t* mutatee_module, procedure_t* f) } } - info1 = mutatee_module->getSourceLines((unsigned long) baseAddr, lines); + bpvector_t lines = {}; + bool info = module->getSourceLines(base_addr, lines); - filename = mname; + string_t filename = mname; - if(info1) + if(info && !lines.empty()) { - // filename = lines[0].fileName(); - row1 = lines[0].lineNumber(); - col1 = lines[0].lineOffset(); - - if(col1 < 0) col1 = 0; - info2 = mutatee_module->getSourceLines((unsigned long) (lastAddr - 1), lines); - if(info2) - { - row2 = lines[1].lineNumber(); - col2 = lines[1].lineOffset(); - if(col2 < 0) col2 = 0; - if(row2 < row1) row1 = row2; - return function_signature(typeName, fname, filename, _params, { row1, 0 }, - { 0, 0 }, false, info1, info2); - } - else - { - return function_signature(typeName, fname, filename, _params, { row1, 0 }, - { 0, 0 }, false, info1, info2); - } + auto row = lines.front().lineNumber(); + return function_signature(typeName, fname, filename, _params, { row, 0 }, + { 0, 0 }, false, info, false); } else { diff --git a/source/bin/omnitrace/function_signature.cpp b/source/bin/omnitrace/function_signature.cpp new file mode 100644 index 0000000000..d6cae8bfae --- /dev/null +++ b/source/bin/omnitrace/function_signature.cpp @@ -0,0 +1,99 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "function_signature.hpp" + +function_signature::function_signature(string_t _ret, const string_t& _name, + string_t _file, location_t _row, location_t _col, + bool _loop, bool _info_beg, bool _info_end) +: m_loop(_loop) +, m_info_beg(_info_beg) +, m_info_end(_info_end) +, m_row(std::move(_row)) +, m_col(std::move(_col)) +, m_return(std::move(_ret)) +, m_name(tim::demangle(_name)) +, m_file(std::move(_file)) +{ + if(m_file.find('/') != string_t::npos) + m_file = m_file.substr(m_file.find_last_of('/') + 1); +} + +function_signature::function_signature(const string_t& _ret, const string_t& _name, + const string_t& _file, + const std::vector& _params, + location_t _row, location_t _col, bool _loop, + bool _info_beg, bool _info_end) +: function_signature(_ret, _name, _file, _row, _col, _loop, _info_beg, _info_end) +{ + m_params = "("; + for(const auto& itr : _params) + m_params.append(itr + ", "); + if(!_params.empty()) m_params = m_params.substr(0, m_params.length() - 2); + m_params += ")"; +} + +string_t +function_signature::get(function_signature& sig) +{ + return sig.get(); +} + +string_t +function_signature::get() const +{ + std::stringstream ss; + if(use_return_info && !m_return.empty()) ss << m_return << " "; + ss << m_name; + if(use_args_info) ss << m_params; + if(m_loop && m_info_beg) + { + auto _row_col_str = [](unsigned long _row, unsigned long _col) { + std::stringstream _ss{}; + if(_row == 0 && _col == 0) return std::string{}; + if(_col > 0) + _ss << "{" << _row << "," << _col << "}"; + else + _ss << "{" << _row << "}"; + return _ss.str(); + }; + + auto _rc1 = _row_col_str(m_row.first, m_col.first); + auto _rc2 = _row_col_str(m_row.second, m_col.second); + if(m_info_end && !_rc1.empty() && !_rc2.empty() && _rc1 != _rc2) + ss << " [" << _rc1 << "-" << _rc2 << "]"; + else if(m_info_end && !_rc1.empty() && !_rc2.empty() && _rc1 == _rc2) + ss << " [" << _rc1 << "]"; + else if(m_info_end && !_rc1.empty() && _rc2.empty()) + ss << " [" << _rc1 << "]"; 
+ else if(!m_info_end && !_rc1.empty()) + ss << " [" << _rc1 << "]"; + else + errprintf(1, "loop line info is empty!"); + } + if(use_file_info && m_file.length() > 0) ss << " [" << m_file; + if(use_line_info && m_row.first > 0) ss << ":" << m_row.first; + if(use_file_info && m_file.length() > 0) ss << "]"; + + m_signature = ss.str(); + return m_signature; +} diff --git a/source/bin/omnitrace/function_signature.hpp b/source/bin/omnitrace/function_signature.hpp new file mode 100644 index 0000000000..dc6ea7bbd7 --- /dev/null +++ b/source/bin/omnitrace/function_signature.hpp @@ -0,0 +1,74 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "fwd.hpp" + +struct function_signature +{ + using location_t = std::pair; + + TIMEMORY_DEFAULT_OBJECT(function_signature) + + function_signature(string_t _ret, const string_t& _name, string_t _file, + location_t _row = { 0, 0 }, location_t _col = { 0, 0 }, + bool _loop = false, bool _info_beg = false, + bool _info_end = false); + + function_signature(const string_t& _ret, const string_t& _name, const string_t& _file, + const std::vector& _params, location_t _row = { 0, 0 }, + location_t _col = { 0, 0 }, bool _loop = false, + bool _info_beg = false, bool _info_end = false); + + static string_t get(function_signature& sig); + string_t get() const; + + bool m_loop = false; + bool m_info_beg = false; + bool m_info_end = false; + location_t m_row = { 0, 0 }; + location_t m_col = { 0, 0 }; + string_t m_return = {}; + string_t m_name = {}; + string_t m_params = "()"; + string_t m_file = {}; + mutable string_t m_signature = {}; + + friend bool operator==(const function_signature& lhs, const function_signature& rhs) + { + return lhs.get() == rhs.get(); + } + + template + void serialize(ArchiveT& _ar, const unsigned) + { + namespace cereal = tim::cereal; + (void) get(); + _ar(cereal::make_nvp("loop", m_loop), cereal::make_nvp("info_beg", m_info_beg), + cereal::make_nvp("info_end", m_info_end), cereal::make_nvp("row", m_row), + cereal::make_nvp("col", m_col), cereal::make_nvp("return", m_return), + cereal::make_nvp("name", m_name), cereal::make_nvp("params", m_params), + cereal::make_nvp("file", m_file), cereal::make_nvp("signature", m_signature)); + (void) get(); + } +}; diff --git a/source/bin/omnitrace/fwd.hpp b/source/bin/omnitrace/fwd.hpp new file mode 100644 index 0000000000..58785515db --- /dev/null +++ b/source/bin/omnitrace/fwd.hpp @@ -0,0 +1,273 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+
+#pragma once
+
+// NOTE(review): the header names of the #include directives below (and the
+// template arguments of several aliases/declarations) appear to have been lost
+// when this text was extracted; those tokens are left exactly as found.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// macro bodies are parenthesized so they expand safely inside larger expressions
+#define MUTNAMELEN       1024
+#define FUNCNAMELEN      (32 * 1024)
+#define NO_ERROR         (-1)
+#define TIMEMORY_BIN_DIR "bin"
+
+// fallback when the platform headers do not provide PATH_MAX
+// (no trailing semicolon: the macro must be usable inside an expression)
+#if !defined(PATH_MAX)
+#    define PATH_MAX std::numeric_limits::max()
+#endif
+
+struct function_signature;
+struct module_function;
+
+template
+using bpvector_t = BPatch_Vector;
+
+using string_t              = std::string;
+using string_view_t         = std::string_view;
+using stringstream_t        = std::stringstream;
+using strvec_t              = std::vector;
+using strset_t              = std::set;
+using regexvec_t            = std::vector;
+using fmodset_t             = std::set;
+using fixed_modset_t        = std::map;
+using exec_callback_t       = BPatchExecCallback;
+using exit_callback_t       = BPatchExitCallback;
+using fork_callback_t       = BPatchForkCallback;
+using patch_t               = BPatch;
+using process_t             = BPatch_process;
+using thread_t              = BPatch_thread;
+using binary_edit_t         = BPatch_binaryEdit;
+using image_t               = BPatch_image;
+using module_t              = BPatch_module;
+using procedure_t           = BPatch_function;
+using snippet_t             = BPatch_snippet;
+using call_expr_t           = BPatch_funcCallExpr;
+using address_space_t       = BPatch_addressSpace;
+using flow_graph_t          = BPatch_flowGraph;
+using basic_block_t         = BPatch_basicBlock;
+using basic_loop_t          = BPatch_basicBlockLoop;
+using procedure_loc_t       = BPatch_procedureLocation;
+using point_t               = BPatch_point;
+using local_var_t           = BPatch_localVar;
+using const_expr_t          = BPatch_constExpr;
+using error_level_t         = BPatchErrorLevel;
+using patch_pointer_t       = std::shared_ptr;
+using snippet_pointer_t     = std::shared_ptr;
+using call_expr_pointer_t   = std::shared_ptr;
+using snippet_vec_t         = bpvector_t;
+using procedure_vec_t       = bpvector_t;
+using basic_block_set_t     = std::set;
+using basic_loop_vec_t      = bpvector_t;
+using snippet_pointer_vec_t = std::vector;
+using instruction_t         = Dyninst::InstructionAPI::Instruction;
+
+void
+omnitrace_prefork_callback(thread_t* parent, thread_t* child);
+
+//======================================================================================//
+//
+//                                  Global Variables
+//
+//======================================================================================//
+//
+//  label settings
+//
+extern bool use_return_info;
+extern bool use_args_info;
+extern bool use_file_info;
+extern bool use_line_info;
+//
+//  heuristic settings
+//
+extern bool   allow_overlapping;
+extern bool   loop_level_instr;
+extern bool   instr_dynamic_callsites;
+extern bool   instr_traps;
+extern bool   instr_loop_traps;
+extern size_t min_address_range;
+extern size_t min_loop_address_range;
+extern size_t min_instructions;
+//
+//  debug settings
+//
+extern bool werror;
+extern bool debug_print;
+extern int  verbose_level;
+//
+//  string settings
+//
+extern string_t main_fname;
+extern string_t argv0;
+extern string_t cmdv0;
+extern string_t default_components;
+extern string_t prefer_library;
+//
+//  global variables
+//
+extern patch_pointer_t bpatch;
+extern call_expr_t*    terminate_expr;
+extern snippet_vec_t   init_names;
+extern snippet_vec_t   fini_names;
+extern fmodset_t       available_module_functions;
+extern fmodset_t       instrumented_module_functions;
+extern fmodset_t       overlapping_module_functions;
+extern fmodset_t       excluded_module_functions;
+extern fixed_modset_t  fixed_module_functions;
+extern regexvec_t      func_include;
+extern regexvec_t      func_exclude;
+extern regexvec_t      file_include;
+extern regexvec_t      file_exclude;
+extern regexvec_t      file_restrict;
+extern regexvec_t      func_restrict;
+//
+//======================================================================================//
+
+// control debug printf statements
+//
+// errprintf(LEVEL, fmt, ...):
+//  - when `werror` is set or LEVEL is negative: prints (subject to verbosity) and
+//    throws std::runtime_error carrying the formatted message
+//  - otherwise: prints a warning (subject to verbosity)
+// The first variadic argument must be a string literal because it is concatenated
+// with the prefix. The message is formatted with snprintf so that an oversized
+// message is truncated instead of overflowing the local buffer.
+#define errprintf(LEVEL, ...)                                                            \
+    {                                                                                    \
+        if(werror || (LEVEL) < 0)                                                        \
+        {                                                                                \
+            if(debug_print || verbose_level >= (LEVEL))                                  \
+                fprintf(stderr, "[omnitrace][exe] Error! " __VA_ARGS__);                 \
+            char _buff[FUNCNAMELEN];                                                     \
+            snprintf(_buff, sizeof(_buff), "[omnitrace][exe] Error! " __VA_ARGS__);      \
+            throw std::runtime_error(std::string{ _buff });                              \
+        }                                                                                \
+        else                                                                             \
+        {                                                                                \
+            if(debug_print || verbose_level >= (LEVEL))                                  \
+                fprintf(stderr, "[omnitrace][exe] Warning! " __VA_ARGS__);               \
+        }                                                                                \
+        fflush(stderr);                                                                  \
+    }
+
+// control verbose printf statements
+// prints to stdout with the "[omnitrace][exe] " prefix when debug_print is set or
+// verbose_level >= LEVEL; always flushes stdout
+#define verbprintf(LEVEL, ...)                                                           \
+    {                                                                                    \
+        if(debug_print || verbose_level >= (LEVEL))                                      \
+            fprintf(stdout, "[omnitrace][exe] " __VA_ARGS__);                            \
+        fflush(stdout);                                                                  \
+    }
+
+// same as verbprintf but without the prefix
+#define verbprintf_bare(LEVEL, ...)                                                      \
+    {                                                                                    \
+        if(debug_print || verbose_level >= (LEVEL)) fprintf(stdout, __VA_ARGS__);        \
+        fflush(stdout);                                                                  \
+    }
+
+//======================================================================================//
+
+// accepts and ignores any arguments (empty body)
+template
+void
+consume_parameters(T&&...)
+{}
+
+//======================================================================================//
+
+extern "C"
+{
+    bool are_file_include_exclude_lists_empty();
+    bool instrument_module(const string_t& file_name);
+    bool instrument_entity(const string_t& function_name);
+    bool module_constraint(string_view_t fname);
+    bool routine_constraint(string_view_t fname);
+}
+
+//======================================================================================//
+
+strset_t
+get_whole_function_names();
+
+function_signature
+get_func_file_line_info(module_t* mutatee_module, procedure_t* f);
+
+function_signature
+get_loop_file_line_info(module_t* mutatee_module, procedure_t* f, flow_graph_t* cfGraph,
+                        basic_loop_t* loopToInstrument);
+
+std::tuple
+query_instr(procedure_t* funcToInstr, procedure_loc_t traceLoc,
+            flow_graph_t* cfGraph = nullptr, basic_loop_t* loopToInstrument = nullptr);
+
+bool
+query_instr(procedure_t* funcToInstr, procedure_loc_t traceLoc, flow_graph_t* cfGraph,
+            basic_loop_t* loopToInstrument, bool allow_traps);
+
+template
+bool
+insert_instr(address_space_t* mutatee, procedure_t* funcToInstr, Tp traceFunc,
+             procedure_loc_t traceLoc, flow_graph_t* cfGraph = nullptr,
+             basic_loop_t* loopToInstrument = nullptr, bool allow_traps = true);
+
+void
+errorFunc(error_level_t level, int num, const char** params);
+
+procedure_t*
+find_function(image_t* appImage, const string_t& functionName, const strset_t& = {});
+
+void
+error_func_real(error_level_t level, int num, const char* const* params);
+
+void
+error_func_fake(error_level_t level, int num, const char* const* params);
diff --git a/source/bin/omnitrace/info.hpp b/source/bin/omnitrace/info.hpp
new file mode 100644
index 0000000000..723cff24ef
--- /dev/null
+++ b/source/bin/omnitrace/info.hpp
@@ -0,0 +1,251 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+#include "fwd.hpp"
+#include "module_function.hpp"
+
+// NOTE(review): several template argument lists in this file (template headers,
+// policy::output_archive / policy::input_archive, std::map, std::vector) appear to
+// have been lost when this text was extracted; those tokens are left as found.
+
+// Writes `_data` as an aligned text table: column widths are computed over the
+// whole set, a header is written, then one row per entry. The shared width state
+// is reset before and after.
+static inline void
+dump_info(std::ostream& _os, const fmodset_t& _data)
+{
+    module_function::reset_width();
+    for(const auto& itr : _data)
+        module_function::update_width(itr);
+
+    module_function::write_header(_os);
+    for(const auto& itr : _data)
+        _os << itr << '\n';
+
+    module_function::reset_width();
+}
+//
+// Serializes `_data` into an archive under the name "module_functions".
+template ::value, int> = 0>
+static inline void
+dump_info(ArchiveT& _ar, const fmodset_t& _data)
+{
+    _ar(tim::cereal::make_nvp("module_functions", _data));
+}
+//
+// Writes `_data` to "<_oname>.<_ext>" where `_ext` is one of txt, xml, json.
+//  - nothing is written unless debug_print is set or verbose_level >= _level
+//  - if the file cannot be opened a message is printed and, when `_fail` is true,
+//    std::runtime_error is thrown
+//  - an unrecognized extension always throws std::runtime_error
+static inline void
+dump_info(const string_t& _label, string_t _oname, const string_t& _ext,
+          const fmodset_t& _data, int _level, bool _fail)
+{
+    namespace cereal = tim::cereal;
+    namespace policy = tim::policy;
+
+    _oname += "." + _ext;
+    auto _handle_error = [&]() {
+        std::stringstream _msg{};
+        // the quote opened around the filename is now closed
+        _msg << "[dump_info] Error opening '" << _oname << "' for output";
+        verbprintf(_level, "%s\n", _msg.str().c_str());
+        if(_fail)
+            throw std::runtime_error(std::string{ "[omnitrace][exe] " } + _msg.str());
+    };
+
+    if(!debug_print && verbose_level < _level) return;
+
+    if(_ext == "txt")
+    {
+        std::ofstream ofs{};
+        if(!tim::filepath::open(ofs, _oname))
+            _handle_error();
+        else
+        {
+            verbprintf(_level, "Outputting '%s'... ", _oname.c_str());
+            dump_info(ofs, _data);
+            verbprintf_bare(_level, "Done\n");
+        }
+        ofs.close();
+    }
+    else if(_ext == "xml")
+    {
+        // serialize into memory first; the archive is finalized when `ar` goes out
+        // of scope at the end of the inner block
+        std::stringstream oss{};
+        {
+            using output_policy     = policy::output_archive;
+            output_policy::indent() = true;
+            auto ar                 = output_policy::get(oss);
+
+            ar->setNextName("omnitrace");
+            ar->startNode();
+            ar->setNextName(_label.c_str());
+            ar->startNode();
+            (*ar)(cereal::make_nvp("module_functions", _data));
+            ar->finishNode();
+            ar->finishNode();
+        }
+
+        std::ofstream ofs{};
+        if(!tim::filepath::open(ofs, _oname))
+            _handle_error();
+        else
+        {
+            verbprintf(_level, "Outputting '%s'... ", _oname.c_str());
+            ofs << oss.str() << std::endl;
+            verbprintf_bare(_level, "Done\n");
+        }
+        ofs.close();
+    }
+    else if(_ext == "json")
+    {
+        std::stringstream oss{};
+        {
+            using output_policy = policy::output_archive;
+            auto ar             = output_policy::get(oss);
+
+            ar->setNextName("omnitrace");
+            ar->startNode();
+            ar->setNextName(_label.c_str());
+            ar->startNode();
+            (*ar)(cereal::make_nvp("module_functions", _data));
+            ar->finishNode();
+            ar->finishNode();
+        }
+
+        std::ofstream ofs{};
+        if(!tim::filepath::open(ofs, _oname))
+            _handle_error();
+        else
+        {
+            verbprintf(_level, "Outputting '%s'... ", _oname.c_str());
+            ofs << oss.str() << std::endl;
+            verbprintf_bare(_level, "Done\n");
+        }
+        ofs.close();
+    }
+    else
+    {
+        throw std::runtime_error(TIMEMORY_JOIN(
+            "", "[omnitrace][exe] Error in ", __FUNCTION__, " :: filename '", _oname,
+            "' does not have one of recognized file extensions: txt, json, xml"));
+    }
+}
+//
+// Writes `_data` once per extension in `_ext`, using `_type` as the label.
+static inline void
+dump_info(const string_t& _oname, const fmodset_t& _data, int _level, bool _fail,
+          const string_t& _type, const strset_t& _ext)
+{
+    for(const auto& itr : _ext)
+        dump_info(_type, _oname, itr, _data, _level, _fail);
+}
+//
+// Reads the "module_functions" entry stored under omnitrace/<_label> from the xml
+// or json file `_iname` into `_data`. The format is chosen by the file extension.
+// Throws std::runtime_error if the file cannot be opened or if the extension is
+// neither xml nor json.
+static inline void
+load_info(const string_t& _label, const string_t& _iname, fmodset_t& _data, int _level)
+{
+    namespace cereal = tim::cereal;
+    namespace policy = tim::policy;
+
+    auto        _pos = _iname.find_last_of('.');
+    std::string _ext = {};
+    if(_pos != std::string::npos) _ext = _iname.substr(_pos + 1, _iname.length());
+
+    auto _handle_error = [&]() {
+        std::stringstream _msg{};
+        // the quote opened around the filename is now closed
+        _msg << "[load_info] Error opening '" << _iname << "' for input";
+        verbprintf(_level, "%s\n", _msg.str().c_str());
+        throw std::runtime_error(std::string{ "[omnitrace][exe] " } + _msg.str());
+    };
+
+    if(_ext == "xml")
+    {
+        verbprintf(_level, "Reading '%s'... ", _iname.c_str());
+        std::ifstream ifs{ _iname };
+        if(!ifs)
+            _handle_error();
+        else
+        {
+            using input_policy = policy::input_archive;
+            auto ar            = input_policy::get(ifs);
+
+            ar->setNextName("omnitrace");
+            ar->startNode();
+            ar->setNextName(_label.c_str());
+            ar->startNode();
+            (*ar)(cereal::make_nvp("module_functions", _data));
+            ar->finishNode();
+            ar->finishNode();
+        }
+        verbprintf_bare(_level, "Done\n");
+        ifs.close();
+    }
+    else if(_ext == "json")
+    {
+        verbprintf(_level, "Reading '%s'... ", _iname.c_str());
+        std::ifstream ifs{ _iname };
+        if(!ifs)
+            _handle_error();
+        else
+        {
+            using input_policy = policy::input_archive;
+            auto ar            = input_policy::get(ifs);
+
+            ar->setNextName("omnitrace");
+            ar->startNode();
+            ar->setNextName(_label.c_str());
+            ar->startNode();
+            (*ar)(cereal::make_nvp("module_functions", _data));
+            ar->finishNode();
+            ar->finishNode();
+        }
+        verbprintf_bare(_level, "Done\n");
+        ifs.close();
+    }
+    else
+    {
+        // only xml and json can be read back (there is no txt reader above)
+        throw std::runtime_error(TIMEMORY_JOIN(
+            "", "[omnitrace][exe] Error in ", __FUNCTION__, " :: filename '", _iname,
+            "' does not have one of recognized extensions: json, xml :: ", _ext));
+    }
+}
+//
+// Tries each label in `_data` in turn until `_inp` loads successfully under that
+// label; the loaded entries are inserted into the set that label maps to. If every
+// label fails, throws std::runtime_error containing all collected error messages.
+static inline void
+load_info(const string_t& _inp, std::map& _data, int _level)
+{
+    std::vector _exceptions{};
+    _exceptions.reserve(_data.size());
+    for(auto& itr : _data)
+    {
+        try
+        {
+            fmodset_t _tmp{};
+            load_info(itr.first, _inp, _tmp, _level);
+            // add to the existing
+            itr.second->insert(_tmp.begin(), _tmp.end());
+            // if it did not throw it was successfully loaded
+            _exceptions.clear();
+            break;
+        } catch(std::exception& _e)
+        {
+            _exceptions.emplace_back(_e.what());
+        }
+    }
+    if(!_exceptions.empty())
+    {
+        std::stringstream _msg{};
+        for(auto& itr : _exceptions)
+        {
+            _msg << "[omnitrace][exe] " << itr << "\n";
+        }
+        throw std::runtime_error(_msg.str());
+    }
+}
diff --git a/source/bin/omnitrace/module_function.cpp b/source/bin/omnitrace/module_function.cpp
new file mode 100644
index 0000000000..ca78064b38
--- /dev/null
+++ b/source/bin/omnitrace/module_function.cpp @@ -0,0 +1,511 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+
+#include "module_function.hpp"
+#include "fwd.hpp"
+#include "omnitrace.hpp"
+
+// NOTE(review): a few template argument lists in this file (std::vector,
+// std::pair) appear to have been lost when this text was extracted; those tokens
+// are left exactly as found.
+
+// Shared (function-local static) column widths used when printing tables:
+// [0] module name, [1] function name, [2] signature.
+module_function::width_t&
+module_function::get_width()
+{
+    static width_t _instance = []() {
+        width_t _tmp;
+        _tmp.fill(0);
+        return _tmp;
+    }();
+    return _instance;
+}
+
+// Zeroes all column widths.
+void
+module_function::reset_width()
+{
+    get_width().fill(0);
+}
+
+// Grows each column width to fit the corresponding field of `rhs`.
+void
+module_function::update_width(const module_function& rhs)
+{
+    get_width()[0] = std::max(get_width()[0], rhs.module_name.length());
+    get_width()[1] = std::max(get_width()[1], rhs.function_name.length());
+    get_width()[2] = std::max(get_width()[2], rhs.signature.get().length());
+}
+
+// Collects everything needed to decide whether `proc` (in module `mod`) should be
+// instrumented: basic blocks, outer loops, instruction count, names, signature
+// and address range. `mod` and `proc` are dereferenced unconditionally.
+module_function::module_function(module_t* mod, procedure_t* proc)
+: module{ mod }
+, function{ proc }
+, flow_graph{ proc->getCFG() }
+{
+    if(flow_graph)
+    {
+        flow_graph->getAllBasicBlocks(basic_blocks);
+        flow_graph->getOuterLoops(loop_blocks);
+    }
+
+    for(const auto& itr : basic_blocks)
+    {
+        std::vector instructions{};
+        itr->getInstructions(instructions);
+        num_instructions += instructions.size();
+    }
+
+    char modname[FUNCNAMELEN];
+    char fname[FUNCNAMELEN];
+    module->getFullName(modname, FUNCNAMELEN);
+    function->getName(fname, FUNCNAMELEN);
+    module_name   = modname;
+    function_name = fname;
+    signature     = get_func_file_line_info(module, function);
+
+    if(!function->isInstrumentable())
+    {
+        verbprintf(0,
+                   "Warning! module function generated for un-instrumentable "
+                   "function: %s [%s]\n",
+                   function_name.c_str(), module_name.c_str());
+    }
+    std::pair _range{};
+    if(function->getAddressRange(_range.first, _range.second))
+        address_range = _range.second - _range.first;
+}
+
+// Writes the table header; name columns are capped at absolute_max_width.
+void
+module_function::write_header(std::ostream& os)
+{
+    auto w0 = std::min(get_width()[0], absolute_max_width);
+    auto w1 = std::min(get_width()[1], absolute_max_width);
+    auto w2 = std::min(get_width()[2], absolute_max_width);
+
+    std::stringstream ss;
+    ss << std::setw(14) << "AddressRange"
+       << " " << std::setw(14) << "#Instructions"
+       << " " << std::setw(6) << "Ratio"
+       << " " << std::setw(w0 + 8) << std::left << "Module"
+       << " " << std::setw(w1 + 8) << std::left << "Function"
+       << " " << std::setw(w2 + 8) << std::left << "FunctionSignature"
+       << "\n";
+    os << ss.str();
+}
+
+// Applies all constraints in priority order and returns whether this function
+// should be instrumented. Each check that fires records its reason in `messages`.
+bool
+module_function::should_instrument() const
+{
+    // hard constraints
+    if(!is_instrumentable()) return false;
+    if(!can_instrument_entry()) return false;
+    if(!can_instrument_exit()) return false;
+    if(is_module_constrained()) return false;
+    if(is_routine_constrained()) return false;
+
+    // should be before user selection
+    constexpr int absolute_min_instructions = 4;
+    if(num_instructions < absolute_min_instructions)
+    {
+        messages.emplace_back(
+            2, "Skipping", "function",
+            TIMEMORY_JOIN("-", "less-than", absolute_min_instructions, "instructions"));
+        return false;
+    }
+
+    // user selection
+    if(is_user_excluded()) return false;
+    // is_user_restricted() returns true when the function does NOT match the
+    // restrict regexes (it records a "Skipping" message in that case), so the
+    // function must be rejected here, not accepted
+    if(is_user_restricted()) return false;
+    if(is_user_included()) return true;
+
+    // should be applied before dynamic-callsite check
+    if(is_overlapping_constrained()) return false;
+    if(is_entry_trap_constrained()) return false;
+    if(is_exit_trap_constrained()) return false;
+
+    // needs to be applied before address range and number of instruction constraints
+    if(is_dynamic_callsite_forced()) return true;
+
+    if(is_address_range_constrained()) return false;
+    if(is_num_instructions_constrained()) return false;
+
+    return true;
+}
+
+// True when Dyninst reports the function as instrumentable.
+bool
+module_function::is_instrumentable() const
+{
+    if(!function->isInstrumentable())
+    {
+        messages.emplace_back(2, "Skipping", "module", "not-instrumentable");
+        return false;
+    }
+
+    return true;
+}
+
+namespace
+{
+// True when `_name` matches (regex_search) at least one expression in `_regexes`.
+bool
+check_regex_restrictions(const std::string& _name, const regexvec_t& _regexes)
+{
+    // NOLINTNEXTLINE
+    for(auto& itr : _regexes)
+        if(std::regex_search(_name, itr)) return true;
+    return false;
+}
+}  // namespace
+
+// Returns true when restrict regexes are set and this function falls OUTSIDE of
+// them (i.e. it is restricted away); returns false when it matches or when no
+// restrict regexes are set. If file_restrict is non-empty its result is final and
+// func_restrict is not consulted.
+bool
+module_function::is_user_restricted() const
+{
+    if(!file_restrict.empty())
+    {
+        if(check_regex_restrictions(module_name, file_restrict))
+        {
+            messages.emplace_back(2, "Forcing", "module", "module-restrict-regex");
+            return false;
+        }
+        else
+        {
+            messages.emplace_back(3, "Skipping", "module", "module-restrict-regex");
+            return true;
+        }
+    }
+
+    if(!func_restrict.empty())
+    {
+        // function regexes are matched against the function name (as is done for
+        // the include/exclude regexes), then against the full signature
+        if(check_regex_restrictions(function_name, func_restrict))
+        {
+            messages.emplace_back(2, "Forcing", "function", "function-restrict-regex");
+            return false;
+        }
+        else if(check_regex_restrictions(signature.get(), func_restrict))
+        {
+            messages.emplace_back(2, "Forcing", "function", "function-restrict-regex");
+            return false;
+        }
+        else
+        {
+            messages.emplace_back(3, "Skipping", "function", "function-restrict-regex");
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// True when the module name matches a file-include regex or the function
+// name/signature matches a function-include regex.
+bool
+module_function::is_user_included() const
+{
+    if(!file_include.empty())
+    {
+        if(check_regex_restrictions(module_name, file_include))
+        {
+            messages.emplace_back(2, "Forcing", "module", "module-include-regex");
+            return true;
+        }
+    }
+
+    if(!func_include.empty())
+    {
+        if(check_regex_restrictions(function_name, func_include))
+        {
+            messages.emplace_back(2, "Forcing", "function", "function-include-regex");
+            return true;
+        }
+        else if(check_regex_restrictions(signature.get(), func_include))
+        {
+            messages.emplace_back(2, "Forcing", "function", "function-include-regex");
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// True when the module name matches a file-exclude regex or the function
+// name/signature matches a function-exclude regex.
+bool
+module_function::is_user_excluded() const
+{
+    if(!file_exclude.empty())
+    {
+        if(check_regex_restrictions(module_name, file_exclude))
+        {
+            messages.emplace_back(2, "Skipping", "module", "module-exclude-regex");
+            return true;
+        }
+    }
+
+    if(!func_exclude.empty())
+    {
+        if(check_regex_restrictions(function_name, func_exclude))
+        {
+            messages.emplace_back(2, "Skipping", "function", "function-exclude-regex");
+            return true;
+        }
+        else if(check_regex_restrictions(signature.get(), func_exclude))
+        {
+            messages.emplace_back(2, "Skipping", "function", "function-exclude-regex");
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Returns the result of Dyninst's findOverlapping for this function.
+bool
+module_function::is_overlapping() const
+{
+    procedure_vec_t _overlapping{};
+    return function->findOverlapping(_overlapping);
+}
+
+// True when instrument_module() rejects, or module_constraint() flags, the module.
+bool
+module_function::is_module_constrained() const
+{
+    if(!instrument_module(module_name) || module_constraint(module_name.c_str()))
+    {
+        messages.emplace_back(2, "Skipping", "module", "module-constraint");
+        return true;
+    }
+    return false;
+}
+
+// True when instrument_entity() rejects, or routine_constraint() flags, either the
+// function name or its signature.
+bool
+module_function::is_routine_constrained() const
+{
+    if(!instrument_entity(function_name) || !instrument_entity(signature.get()) ||
+       routine_constraint(function_name) || routine_constraint(signature.get()))
+    {
+        messages.emplace_back(2, "Skipping", "function", "function-constraint");
+        return true;
+    }
+    return false;
+}
+
+// True when overlapping functions are disallowed and this function overlaps.
+bool
+module_function::is_overlapping_constrained() const
+{
+    if(!allow_overlapping && is_overlapping())
+    {
+        messages.emplace_back(2, "Skipping", "function", "overlapping");
+        return true;
+    }
+
+    return false;
+}
+
+// Forwards to the flow graph; false when there is no flow graph.
+bool
+module_function::contains_dynamic_callsites() const
+{
+    if(flow_graph) return flow_graph->containsDynamicCallsites();
+
+    return false;
+}
+
+// True when dynamic-callsite instrumentation is enabled and this function
+// contains dynamic callsites.
+bool
+module_function::is_dynamic_callsite_forced() const
+{
+    if(instr_dynamic_callsites && contains_dynamic_callsites())
+    {
+        messages.emplace_back(2, "Forcing", "function", "dynamic-callsites");
+        return true;
+    }
+
+    return false;
+}
+
+// Functions with loops are compared against the loop threshold; all others
+// against min_address_range.
+bool
+module_function::is_address_range_constrained() const
+{
+    if(!loop_blocks.empty()) return is_loop_address_range_constrained();
+
+    if(address_range < min_address_range)
+    {
+        messages.emplace_back(2, "Skipping", "function", "min-address-range");
+        return true;
+    }
+    return false;
+}
+
+// True when the function has loops and its address range is below
+// min_loop_address_range.
+bool
+module_function::is_loop_address_range_constrained() const
+{
+    if(loop_blocks.empty()) return false;
+
+    if(address_range < min_loop_address_range)
+    {
+        messages.emplace_back(2, "Skipping", "function", "min-address-range-loop");
+        return true;
+    }
+
+    return false;
+}
+
+// True when the function has fewer than min_instructions instructions.
+bool
+module_function::is_num_instructions_constrained() const
+{
+    if(num_instructions < min_instructions)
+    {
+        messages.emplace_back(2, "Skipping", "function", "min-instructions");
+        return true;
+    }
+
+    return false;
+}
+
+// True when query_instr reports at least one entry point.
+bool
+module_function::can_instrument_entry() const
+{
+    size_t _num_points = 0;
+    size_t _num_traps  = 0;
+
+    std::tie(_num_points, _num_traps) = query_instr(function, BPatch_entry);
+
+    if(_num_points == 0)
+    {
+        messages.emplace_back(3, "Skipping", "function", "no-instrumentable-entry-point");
+        return false;
+    }
+
+    return true;
+}
+
+// True when query_instr reports at least one exit point.
+bool
+module_function::can_instrument_exit() const
+{
+    size_t _num_points = 0;
+    size_t _num_traps  = 0;
+
+    std::tie(_num_points, _num_traps) = query_instr(function, BPatch_exit);
+
+    if(_num_points == 0)
+    {
+        messages.emplace_back(3, "Skipping", "function", "no-instrumentable-exit-point");
+        return false;
+    }
+
+    return true;
+}
+
+// When trap instrumentation is disabled: true if the trap count equals the point
+// count for the entry location.
+bool
+module_function::is_entry_trap_constrained() const
+{
+    if(instr_traps) return false;
+
+    size_t _num_points = 0;
+    size_t _num_traps  = 0;
+
+    std::tie(_num_points, _num_traps) = query_instr(function, BPatch_entry);
+
+    // instr_traps is known to be false here (see the early return above)
+    if((_num_points - _num_traps) == 0)
+    {
+        messages.emplace_back(3, "Skipping", "function",
+                              "entry-point-trap-instrumentation");
+        return true;
+    }
+
+    return false;
+}
+
+// When trap instrumentation is disabled: true if the trap count equals the point
+// count for the exit location.
+bool
+module_function::is_exit_trap_constrained() const
+{
+    if(instr_traps) return false;
+
+    size_t _num_points = 0;
+    size_t _num_traps  = 0;
+
+    std::tie(_num_points, _num_traps) = query_instr(function, BPatch_exit);
+
+    if((_num_points - _num_traps) == 0)
+    {
+        messages.emplace_back(3, "Skipping", "function",
+                              "exit-point-trap-instrumentation");
+        return true;
+    }
+
+    return false;
+}
+
+// Inserts entry/exit instrumentation for this function and, when loop-level
+// instrumentation is enabled, for each of its outer loops.
+//
+// Returns { number of functions instrumented (0 or 1), number of loops
+// instrumented }.
+std::pair
+module_function::operator()(address_space_t* _addr_space, procedure_t* _entr_trace,
+                            procedure_t* _exit_trace) const
+{
+    std::pair _count = { 0, 0 };
+
+    auto _name       = signature.get();
+    auto _trace_entr = omnitrace_call_expr(_name.c_str());
+    auto _trace_exit = omnitrace_call_expr(_name.c_str());
+    auto _entr       = _trace_entr.get(_entr_trace);
+    auto _exit       = _trace_exit.get(_exit_trace);
+
+    if(insert_instr(_addr_space, function, _entr, BPatch_entry, nullptr, nullptr,
+                    instr_traps) &&
+       insert_instr(_addr_space, function, _exit, BPatch_exit, nullptr, nullptr,
+                    instr_traps))
+    {
+        messages.emplace_back(1, "Instrumenting", "function", "no-constraint");
+        ++_count.first;
+    }
+
+    for(auto* itr : loop_blocks)
+    {
+        if(!loop_level_instr) continue;
+
+        // `_v` is true when the constraint applies: record the reason and tell the
+        // caller to skip this loop
+        auto _is_constrained = [this](bool _v, const std::string& _label) {
+            if(_v)
+            {
+                messages.emplace_back(3, "Skipping", "function", _label);
+                return true;
+            }
+            return false;
+        };
+
+        size_t _points = 0;
+        size_t _ntraps = 0;
+        std::tie(_points, _ntraps) = query_instr(function, BPatch_entry, flow_graph, itr);
+
+        // NOTE(review): the trap checks below test instr_traps while the loop
+        // insertion further down passes instr_loop_traps -- confirm which setting
+        // is intended to govern loops
+        if(_is_constrained(_points == 0, "no-instrumentable-loop-entry-point")) continue;
+        if(_is_constrained(!instr_traps && (_points - _ntraps) == 0,
+                           "loop-entry-point-trap-instrumentation"))
+            continue;
+
+        std::tie(_points, _ntraps) = query_instr(function, BPatch_exit, flow_graph, itr);
+
+        if(_is_constrained(_points == 0, "no-instrumentable-loop-exit-point")) continue;
+        if(_is_constrained(!instr_traps && (_points - _ntraps) == 0,
+                           "loop-exit-point-trap-instrumentation"))
+            continue;
+
+        auto lname  = get_loop_file_line_info(module, function, flow_graph, itr);
+        auto _lname = lname.get();
+
+        messages.emplace_back(1, "Loop Instrumenting", "function", "no-constraint");
+        ++_count.second;
+
+        auto _ltrace_entr = omnitrace_call_expr(_lname.c_str());
+        auto _ltrace_exit = omnitrace_call_expr(_lname.c_str());
+        auto _lentr       = _ltrace_entr.get(_entr_trace);
+        auto _lexit       = _ltrace_exit.get(_exit_trace);
+
+        insert_instr(_addr_space, function, _lentr, BPatch_entry, flow_graph, itr,
+                     instr_loop_traps);
+        insert_instr(_addr_space, function, _lexit, BPatch_exit, flow_graph, itr,
+                     instr_loop_traps);
+    }
+
+    return _count;
+}
diff --git a/source/bin/omnitrace/module_function.hpp b/source/bin/omnitrace/module_function.hpp
new file mode 100644
index 0000000000..39041c76a7
--- /dev/null
+++ b/source/bin/omnitrace/module_function.hpp
@@ -0,0 +1,185 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+ +#pragma once + +#include "function_signature.hpp" +#include "fwd.hpp" + +#include +#include + +struct module_function +{ + using width_t = std::array; + using address_t = Dyninst::Address; + + static constexpr size_t absolute_max_width = 80; + static width_t& get_width(); + static void reset_width(); + static void update_width(const module_function& rhs); + static void write_header(std::ostream& os); + + TIMEMORY_DEFAULT_OBJECT(module_function) + + module_function(module_t* mod, procedure_t* proc); + + std::pair operator()(address_space_t* _addr_space, + procedure_t* _entr_trace, + procedure_t* _exit_trace) const; + + // applies logic for all "is_*" and "can_*" checks below + bool should_instrument() const; + + // hard constraints + bool is_instrumentable() const; // checks whether can instrument + bool can_instrument_entry() const; // checks for entry points + bool can_instrument_exit() const; // checks for exit points + bool is_module_constrained() const; // checks module constraints + bool is_routine_constrained() const; // checks function constraints + + // user bypass of heuristics + bool is_user_restricted() const; // checks user restrict regexes + bool is_user_included() const; // checks user include regexes + bool is_user_excluded() const; // checks user exclude regexes + + // applied before dynamic-callsite constraint + bool is_overlapping_constrained() const; // checks overlapping constrains + bool is_entry_trap_constrained() const; // checks entry trap constraint + bool is_exit_trap_constrained() const; // checks exit trap constraint + + // applied before address range and # instruction constraints + bool is_dynamic_callsite_forced() const; // checks dynamic callsites + + // estimate the size/work of the function + bool is_address_range_constrained() const; // checks address range constraint + bool is_num_instructions_constrained() const; // check # instructions constraint + + uint64_t address_range = 0; + uint64_t num_instructions = 0; + module_t* 
module = nullptr; + procedure_t* function = nullptr; + flow_graph_t* flow_graph = nullptr; + string_t module_name = {}; + string_t function_name = {}; + function_signature signature = {}; + basic_block_set_t basic_blocks = {}; + basic_loop_vec_t loop_blocks = {}; + + using str_msg_t = std::tuple; + using str_msg_vec_t = std::vector; + + mutable str_msg_vec_t messages = {}; + + bool is_overlapping() const; // checks if func overlaps + +private: + bool is_loop_address_range_constrained() const; // checks loop addr range constraint + bool contains_dynamic_callsites() const; + +public: + template + void serialize(ArchiveT& ar, const unsigned); + + friend bool operator<(const module_function& lhs, const module_function& rhs) + { + return (lhs.module_name == rhs.module_name) + ? ((lhs.function_name == rhs.function_name) + ? (lhs.signature.get() < rhs.signature.get()) + : (lhs.function_name < rhs.function_name)) + : (lhs.module_name < rhs.module_name); + } + + friend bool operator==(const module_function& lhs, const module_function& rhs) + { + return std::tie(lhs.module_name, lhs.function_name, lhs.signature, + lhs.address_range, lhs.num_instructions) == + std::tie(rhs.module_name, rhs.function_name, rhs.signature, + rhs.address_range, rhs.num_instructions); + } + + friend std::ostream& operator<<(std::ostream& os, const module_function& rhs) + { + std::stringstream ss; + + auto w0 = std::min(get_width()[0], absolute_max_width); + auto w1 = std::min(get_width()[1], absolute_max_width); + auto w2 = std::min(get_width()[2], absolute_max_width); + + auto _get_str = [](const std::string& _inc) { + if(_inc.length() > absolute_max_width) + return _inc.substr(0, absolute_max_width - 3) + "..."; + return _inc; + }; + + // clang-format off + ss << std::setw(14) << rhs.address_range << " " + << std::setw(14) << rhs.num_instructions << " " + << std::setw(6) << std::setprecision(2) << std::fixed << (rhs.address_range / static_cast(rhs.num_instructions)) << " " + << std::setw(w0 + 8) 
<< std::left << _get_str(rhs.module_name) << " " + << std::setw(w1 + 8) << std::left << _get_str(rhs.function_name) << " " + << std::setw(w2 + 8) << std::left << _get_str(rhs.signature.get()); + // clang-format on + + os << ss.str(); + return os; + } +}; + +template +void +module_function::serialize(ArchiveT& ar, const unsigned) +{ + namespace cereal = tim::cereal; + ar(cereal::make_nvp("address_range", address_range), + cereal::make_nvp("instructions", num_instructions), + cereal::make_nvp("module", module_name), + cereal::make_nvp("function", function_name), + cereal::make_nvp("signature", signature)); + + if constexpr(tim::concepts::is_output_archive::value) + { + ar.setNextName("heuristics"); + ar.startNode(); + ar(cereal::make_nvp("should_instrument", should_instrument()), + cereal::make_nvp("is_instrumentable", is_instrumentable()), + cereal::make_nvp("can_instrument_entry", can_instrument_entry()), + cereal::make_nvp("can_instrument_exit", can_instrument_exit()), + cereal::make_nvp("contains_dynamic_callsites", contains_dynamic_callsites()), + cereal::make_nvp("is_module_constrained", is_module_constrained()), + cereal::make_nvp("is_routine_constrained", is_routine_constrained()), + cereal::make_nvp("is_user_restricted", is_user_restricted()), + cereal::make_nvp("is_user_included", is_user_included()), + cereal::make_nvp("is_user_excluded", is_user_excluded()), + cereal::make_nvp("is_overlapping_constrained", is_overlapping_constrained()), + cereal::make_nvp("is_entry_trap_constrained", is_entry_trap_constrained()), + cereal::make_nvp("is_exit_trap_constrained", is_exit_trap_constrained()), + cereal::make_nvp("is_dynamic_callsite_forced", is_dynamic_callsite_forced()), + cereal::make_nvp("is_address_range_constrained", + is_address_range_constrained()), + cereal::make_nvp("is_loop_address_range_constrained", + is_loop_address_range_constrained()), + cereal::make_nvp("is_num_instructions_constrained", + is_num_instructions_constrained())); + ar.finishNode(); 
+ } +} diff --git a/source/bin/omnitrace/omnitrace.cpp b/source/bin/omnitrace/omnitrace.cpp index 4bc4d5b951..1541f94d5f 100644 --- a/source/bin/omnitrace/omnitrace.cpp +++ b/source/bin/omnitrace/omnitrace.cpp @@ -21,9 +21,11 @@ // SOFTWARE. #include "omnitrace.hpp" +#include "fwd.hpp" #include #include +#include #include #include #include @@ -35,45 +37,73 @@ #include #include -bool debug_print = false; -int verbose_level = tim::get_env("TIMEMORY_RUN_VERBOSE", 0); +bool use_return_info = false; +bool use_args_info = false; +bool use_file_info = false; +bool use_line_info = false; +bool allow_overlapping = false; +bool loop_level_instr = false; +bool instr_dynamic_callsites = false; +bool instr_traps = false; +bool instr_loop_traps = false; +size_t min_address_range = (1 << 8); // 256 +size_t min_loop_address_range = (1 << 8); // 256 +size_t min_instructions = (1 << 6); // 64 +bool werror = false; +bool debug_print = false; +int verbose_level = tim::get_env("OMNITRACE_VERBOSE_INSTRUMENT", 0); +string_t main_fname = "main"; +string_t argv0 = {}; +string_t cmdv0 = {}; +string_t default_components = "wall_clock"; +string_t prefer_library = {}; +// +// global variables +// +patch_pointer_t bpatch = {}; +call_expr_t* terminate_expr = nullptr; +snippet_vec_t init_names = {}; +snippet_vec_t fini_names = {}; +fmodset_t available_module_functions = {}; +fmodset_t instrumented_module_functions = {}; +fmodset_t overlapping_module_functions = {}; +fmodset_t excluded_module_functions = {}; +fixed_modset_t fixed_module_functions = {}; +regexvec_t func_include = {}; +regexvec_t func_exclude = {}; +regexvec_t file_include = {}; +regexvec_t file_exclude = {}; +regexvec_t file_restrict = {}; +regexvec_t func_restrict = {}; namespace { -bool binary_rewrite = false; -bool is_attached = false; -bool loop_level_instr = false; -bool werror = false; -bool use_mpi = false; -bool is_static_exe = false; -bool is_driver = false; -bool allow_overlapping = false; -bool instr_dynamic_callsites = 
false; -bool instr_traps = false; -bool instr_loop_traps = false; -bool explicit_dump_and_exit = false; -size_t batch_size = 50; -strset_t extra_libs = {}; -size_t min_address_range = (1 << 8); // 256 -size_t min_loop_address_range = (1 << 8); // 256 -std::vector> hash_ids = {}; -std::map use_stubs = {}; -std::map beg_stubs = {}; -std::map end_stubs = {}; -strvec_t init_stub_names = {}; -strvec_t fini_stub_names = {}; -strset_t used_stub_names = {}; -std::vector env_variables = {}; -std::map beg_expr = {}; -std::map end_expr = {}; -const auto npos_v = string_t::npos; -string_t instr_mode = "trace"; -string_t print_instrumented = {}; -string_t print_excluded = {}; -string_t print_available = {}; -string_t print_overlapping = {}; -strset_t print_formats = { "txt", "json" }; -std::string modfunc_dump_dir = {}; +bool binary_rewrite = false; +bool is_attached = false; +bool use_mpi = false; +bool is_static_exe = false; +bool is_driver = false; +bool explicit_dump_and_exit = false; +size_t batch_size = 50; +strset_t extra_libs = {}; +std::vector> hash_ids = {}; +std::map use_stubs = {}; +std::map beg_stubs = {}; +std::map end_stubs = {}; +strvec_t init_stub_names = {}; +strvec_t fini_stub_names = {}; +strset_t used_stub_names = {}; +std::vector env_variables = {}; +std::map beg_expr = {}; +std::map end_expr = {}; +const auto npos_v = string_t::npos; +string_t instr_mode = "trace"; +string_t print_instrumented = {}; +string_t print_excluded = {}; +string_t print_available = {}; +string_t print_overlapping = {}; +strset_t print_formats = { "txt", "json" }; +std::string modfunc_dump_dir = {}; auto regex_opts = std::regex_constants::egrep | std::regex_constants::optimize; std::string @@ -554,6 +584,15 @@ main(int argc, char** argv) .dtype("boolean") .max_count(1) .action([](parser_t& p) { loop_level_instr = p.get("instrument-loops"); }); + parser + .add_argument({ "-i", "--min-instructions" }, + "If the number of instructions in a function is less than this " + "value, 
exclude it from instrumentation") + .count(1) + .dtype("int") + .set_default(min_instructions) + .action( + [](parser_t& p) { min_instructions = p.get("min-instructions"); }); parser .add_argument({ "-r", "--min-address-range" }, "If the address range of a function is less than this value, " @@ -886,7 +925,7 @@ main(int argc, char** argv) { if(!pitr->isInstrumentable()) continue; auto _modfn = module_function{ itr, pitr }; - module_names.insert(_modfn.module); + module_names.insert(_modfn.module_name); _insert_module_function(available_module_functions, _modfn); _add_overlapping(itr, pitr); } @@ -907,7 +946,7 @@ main(int argc, char** argv) if(mod && itr->isInstrumentable()) { auto _modfn = module_function{ mod, itr }; - module_names.insert(_modfn.module); + module_names.insert(_modfn.module_name); _insert_module_function(available_module_functions, _modfn); _add_overlapping(mod, itr); } @@ -1034,11 +1073,15 @@ main(int argc, char** argv) // //----------------------------------------------------------------------------------// - auto* _mutatee_init = find_function(app_image, "_init"); - auto* _mutatee_fini = find_function(app_image, "_fini"); - auto* main_func = find_function(app_image, main_fname.c_str()); - auto* mpi_init_func = find_function(app_image, "MPI_Init", { "MPI_Init_thread" }); - auto* mpi_fini_func = find_function(app_image, "MPI_Finalize"); + auto* _mutatee_init = find_function(app_image, "_init"); + auto* _mutatee_fini = find_function(app_image, "_fini"); + auto* main_func = find_function(app_image, main_fname.c_str()); + auto* mpi_init_func = find_function(app_image, "MPI_Init", { "MPI_Init_thread" }); + auto* mpi_fini_func = find_function(app_image, "MPI_Finalize"); + auto* user_start_func = find_function(app_image, "omnitrace_user_start_trace", + { "omnitrace_user_start_thread_trace" }); + auto* user_stop_func = find_function(app_image, "omnitrace_user_stop_trace", + { "omnitrace_user_stop_thread_trace" }); 
//----------------------------------------------------------------------------------// // @@ -1057,17 +1100,14 @@ main(int argc, char** argv) // //----------------------------------------------------------------------------------// - verbprintf(0, "Finding functions in image...\n"); + verbprintf(0, "Finding instrumentation functions...\n"); auto* entr_trace = find_function(app_image, "omnitrace_push_trace"); auto* exit_trace = find_function(app_image, "omnitrace_pop_trace"); - auto* entr_hash = find_function(app_image, "omnitrace_push_trace_hash"); - auto* exit_hash = find_function(app_image, "omnitrace_pop_trace_hash"); auto* init_func = find_function(app_image, "omnitrace_init"); auto* fini_func = find_function(app_image, "omnitrace_finalize"); auto* env_func = find_function(app_image, "omnitrace_set_env"); auto* mpi_func = find_function(app_image, "omnitrace_set_mpi"); - auto* hash_func = find_function(app_image, "omnitrace_add_hash_id"); if(!main_func && main_fname == "main") main_func = find_function(app_image, "_main"); @@ -1351,6 +1391,9 @@ main(int argc, char** argv) env_vars.emplace_back(TIMEMORY_JOIN('=', "HSA_TOOLS_LIB", _libname)); env_vars.emplace_back(TIMEMORY_JOIN('=', "OMNITRACE_MPI_INIT", "OFF")); env_vars.emplace_back(TIMEMORY_JOIN('=', "OMNITRACE_MPI_FINALIZE", "OFF")); + env_vars.emplace_back( + TIMEMORY_JOIN('=', "OMNITRACE_INIT_ENABLED", + (user_start_func && user_stop_func) ? "OFF" : "ON")); env_vars.emplace_back( TIMEMORY_JOIN('=', "OMNITRACE_TIMEMORY_COMPONENTS", default_components)); env_vars.emplace_back( @@ -1405,451 +1448,27 @@ main(int argc, char** argv) //----------------------------------------------------------------------------------// // - // Lambda for instrumenting procedures. The first pass (usage_pass = true) will - // generate the hash_ids for each string so that these can be inserted in bulk - // with one operation and do not have to be calculated during runtime. 
- // - //----------------------------------------------------------------------------------// - std::vector> instr_procedure_functions; - auto instr_procedures = [&](const procedure_vec_t& procedures) { - // - auto _report = [](int _lvl, const string_t& _action, const string_t& _type, - const string_t& _reason, const string_t& _name, - const std::string& _extra = {}) { - static std::map already_reported{}; - if(already_reported[_type].count(_name) == 0) - { - verbprintf(_lvl, "[%s][%s] %s :: '%s'", _type.c_str(), _action.c_str(), - _reason.c_str(), _name.c_str()); - if(!_extra.empty()) verbprintf_bare(_lvl, " (%s)", _extra.c_str()); - verbprintf_bare(_lvl, "...\n"); - already_reported[_type].insert(_name); - } - }; - - auto check_regex_restrictions = [](const std::string& _name, - const regexvec_t& _regexes) { - // NOLINTNEXTLINE - for(auto& itr : _regexes) - if(std::regex_search(_name, itr)) return true; - return false; - }; - - std::pair _count = { 0, 0 }; - for(auto* itr : procedures) - { - if(!itr) continue; - - char modname[FUNCNAMELEN]; - char fname[FUNCNAMELEN]; - - itr->getName(fname, FUNCNAMELEN); - module_t* mod = itr->getModule(); - if(mod) - mod->getFullName(modname, FUNCNAMELEN); - else - itr->getModuleName(modname, FUNCNAMELEN); - - if(!itr->isInstrumentable()) - { - _report(3, "Skipping", "function", "uninstrumentable", fname); - continue; - } - - auto name = get_func_file_line_info(mod, itr); - - if(itr == main_func || name.m_name == "main" || name.get() == main_sign.get()) - { - hash_ids.emplace_back(std::hash()(main_sign.get()), - main_sign.get()); - _insert_module_function( - available_module_functions, - module_function{ modname, fname, main_sign, itr }); - _insert_module_function( - instrumented_module_functions, - module_function{ modname, fname, main_sign, itr }); - continue; - } - - if(strlen(modname) == 0) - { - _report(3, "Skipping", "module", "empty name", modname); - continue; - } - - if(name.get().empty()) - { - _report(3, "Skipping", 
"function", "empty name", fname); - continue; - } - - // apply module and function restrictions - auto _force_inc = false; - - //--------------------------------------------------------------------------// - // - // RESTRICT REGEXES - // - //--------------------------------------------------------------------------// - if(!file_restrict.empty()) - { - if(check_regex_restrictions(modname, file_restrict)) - { - _report(2, "Forcing", "module", "module-restrict-regex", modname); - _force_inc = true; - } - else - { - _report(3, "Skipping", "module", "module-restrict-regex", modname); - continue; - } - } - - if(!func_restrict.empty()) - { - if(check_regex_restrictions(name.m_name, func_restrict)) - { - _report(2, "Forcing", "function", "function-restrict-regex", - name.m_name); - _force_inc = true; - } - else if(check_regex_restrictions(name.get(), func_restrict)) - { - _report(2, "Forcing", "function", "function-restrict-regex", - name.get()); - _force_inc = true; - } - else - { - _report(3, "Skipping", "function", "function-restrict-regex", - name.get()); - continue; - } - } - - //--------------------------------------------------------------------------// - // - // INCLUDE REGEXES - // - //--------------------------------------------------------------------------// - if(!file_include.empty()) - { - if(check_regex_restrictions(modname, file_include)) - { - _report(2, "Forcing", "module", "module-include-regex", modname); - _force_inc = true; - } - } - - if(!func_include.empty()) - { - if(check_regex_restrictions(name.m_name, func_include)) - { - _report(2, "Forcing", "function", "function-include-regex", - name.m_name); - _force_inc = true; - } - else if(check_regex_restrictions(name.get(), func_include)) - { - _report(2, "Forcing", "function", "function-include-regex", - name.get()); - _force_inc = true; - } - } - - //--------------------------------------------------------------------------// - // - // EXCLUDE REGEXES - // - 
//--------------------------------------------------------------------------// - if(!file_exclude.empty()) - { - if(check_regex_restrictions(modname, file_exclude)) - { - _report(2, "Skipping", "module", "module-exclude-regex", modname); - continue; - } - } - - if(!func_exclude.empty()) - { - if(check_regex_restrictions(name.m_name, func_exclude)) - { - _report(2, "Skipping", "function", "function-exclude-regex", - name.m_name); - continue; - } - else if(check_regex_restrictions(name.get(), func_exclude)) - { - _report(2, "Skipping", "function", "function-exclude-regex", - name.get()); - continue; - } - } - - // try to get loops via the control flow graph - flow_graph_t* cfg = itr->getCFG(); - basic_loop_vec_t basic_loop{}; - if(cfg) cfg->getOuterLoops(basic_loop); - - if(!_force_inc) - { - if(module_constraint(modname)) continue; - if(!instrument_module(modname)) continue; - - if(routine_constraint(name.m_name.c_str())) continue; - if(!instrument_entity(name.m_name)) continue; - - if(is_static_exe && has_debug_info && string_t{ fname } == "_fini" && - string_t{ modname } == "DEFAULT_MODULE") - { - _report(3, "Skipping", "function", "DEFAULT_MODULE", fname); - continue; - } - - _add_overlapping(mod, itr); - - if(!allow_overlapping && - overlapping_module_functions.find(module_function{ mod, itr }) != - overlapping_module_functions.end()) - { - _report(3, "Skipping", "function", "overlapping", fname); - continue; - } - - // directly try to get loop entry points - const std::vector* _loop_entries = - itr->findPoint(BPatch_locLoopEntry); - - // if the function has dynamic callsites and user specified instrumenting - // dynamic callsites, force the instrumentation - bool _force_instr = false; - if(cfg && instr_dynamic_callsites) - _force_instr = cfg->containsDynamicCallsites(); - - auto _address_range = module_function{ mod, itr }.address_range; - auto _num_loop_entries = - (_loop_entries) - ? 
std::max(_loop_entries->size(), basic_loop.size()) - : basic_loop.size(); - auto _has_loop_entries = (_num_loop_entries > 0); - auto _skip_range = - (_has_loop_entries) ? false : (_address_range < min_address_range); - auto _skip_loop_range = (_has_loop_entries) - ? (_address_range < min_loop_address_range) - : false; - - if(_force_instr && (_skip_range || _skip_loop_range)) - { - _report(2, "Forcing", "function", "dynamic-callsite", fname); - } - else if(_skip_range) - { - _report(2, "Skipping", "function", "min-address-range", fname, - TIMEMORY_JOIN('=', "range", _address_range)); - continue; - } - else if(_skip_loop_range) - { - _report(2, "Skipping", "function", "min-address-range-loop", fname, - TIMEMORY_JOIN('=', "range", _address_range)); - continue; - } - } - - bool _entr_success = - query_instr(itr, BPatch_entry, nullptr, nullptr, instr_traps); - bool _exit_success = - query_instr(itr, BPatch_exit, nullptr, nullptr, instr_traps); - if(!_entr_success && !_exit_success) - { - _report(3, "Skipping", "function", - "Either no entry " - "instrumentation points were found or instrumentation " - "required traps and instrumenting via traps were disabled.", - fname); - continue; - } - else if(_entr_success && !_exit_success) - { - std::stringstream _ss{}; - _ss << "Function can be only partially instrument (entry = " - << std::boolalpha << _entr_success << ", exit = " << _exit_success - << ")"; - _report(3, "Skipping", "function", _ss.str(), fname); - continue; - } - - hash_ids.emplace_back(std::hash()(name.get()), name.get()); - _insert_module_function(available_module_functions, - module_function{ mod, itr }); - _insert_module_function(instrumented_module_functions, - module_function{ mod, itr }); - - auto _f = [=]() { - static std::set _reported{}; - auto _hashv = - std::hash{}(TIMEMORY_JOIN('|', modname, name.m_name)); - if(!_reported.emplace(_hashv).second) - { - verbprintf(1, "Instrumenting |> [ %s ] -> [ %s ]\n", modname, - name.m_name.c_str()); - } - - auto 
_name = name.get(); - auto _hash = std::hash()(_name); - auto _trace_entr = (entr_hash) ? omnitrace_call_expr(_hash) - : omnitrace_call_expr(_name.c_str()); - auto _trace_exit = (exit_hash) ? omnitrace_call_expr(_hash) - : omnitrace_call_expr(_name.c_str()); - auto _entr = _trace_entr.get((entr_hash) ? entr_hash : entr_trace); - auto _exit = _trace_exit.get((exit_hash) ? exit_hash : exit_trace); - - insert_instr(addr_space, itr, _entr, BPatch_entry, nullptr, nullptr, - instr_traps); - insert_instr(addr_space, itr, _exit, BPatch_exit, nullptr, nullptr, - instr_traps); - }; - - instr_procedure_functions.emplace_back(_f); - ++_count.first; - - if(loop_level_instr) - { - verbprintf(3, "Instrumenting at the loop level: %s\n", - name.m_name.c_str()); - - for(auto* litr : basic_loop) - { - bool _lentr_success = - query_instr(itr, BPatch_entry, cfg, litr, instr_loop_traps); - bool _lexit_success = - query_instr(itr, BPatch_exit, cfg, litr, instr_loop_traps); - if(!_lentr_success && !_lexit_success) - { - _report( - 3, "Skipping", "function-loop", - "Either no entry instrumentation points were found or " - "instrumentation " - "required traps and instrumenting via traps were disabled.", - fname); - continue; - } - else if(_lentr_success && !_lexit_success) - { - std::stringstream _ss{}; - _ss << "Function can be only partially instrument (entry = " - << std::boolalpha << _lentr_success - << ", exit = " << _lexit_success << ")"; - _report(3, "Skipping", "function-loop", _ss.str(), fname); - continue; - } - - auto lname = get_loop_file_line_info(mod, itr, cfg, litr); - auto _lname = lname.get(); - auto _lhash = std::hash()(_lname); - hash_ids.emplace_back(_lhash, _lname); - auto _lf = [=]() { - static std::set _reported{}; - auto _hashv = std::hash{}( - TIMEMORY_JOIN('|', modname, name.m_name)); - if(!_reported.emplace(_hashv).second) - { - verbprintf(1, "Loop Instrumenting |> [ %s ] -> [ %s ]\n", - modname, name.m_name.c_str()); - } - auto _ltrace_entr = (entr_hash) - ? 
omnitrace_call_expr(_lhash) - : omnitrace_call_expr(_lname.c_str()); - auto _ltrace_exit = (exit_hash) - ? omnitrace_call_expr(_lhash) - : omnitrace_call_expr(_lname.c_str()); - auto _lentr = - _ltrace_entr.get((entr_hash) ? entr_hash : entr_trace); - auto _lexit = - _ltrace_exit.get((exit_hash) ? exit_hash : exit_trace); - - insert_instr(addr_space, itr, _lentr, BPatch_entry, cfg, litr, - instr_loop_traps); - insert_instr(addr_space, itr, _lexit, BPatch_exit, cfg, litr, - instr_loop_traps); - }; - instr_procedure_functions.emplace_back(_lf); - ++_count.second; - } - } - } - return _count; - }; - - //----------------------------------------------------------------------------------// - // - // Do a first pass through all procedures to generate the hash ids + // Sort the available module functions into appropriate containers // //----------------------------------------------------------------------------------// if(instr_mode == "trace") { - const int _verbose_lvl = 2; - verbprintf(2, "Beginning loop over modules [hash id generation pass]\n"); - std::vector>> _pass_info{}; - for(auto& m : modules) + for(const auto& itr : available_module_functions) { - char modname[1024]; - m->getName(modname, 1024); - if(strstr(modname, "libdyninst") != nullptr) continue; - - if(!m->getProcedures()) + if(itr.should_instrument()) { - verbprintf(_verbose_lvl, "Skipping module w/ no procedures: '%s'\n", - modname); - continue; - } - - verbprintf(_verbose_lvl + 1, "Parsing module: %s\n", modname); - bpvector_t* p = m->getProcedures(); - if(!p) continue; - - verbprintf(_verbose_lvl, "%4zu procedures are begin processed in %s\n", - p->size(), modname); - - auto _count = instr_procedures(*p); - - _pass_info.emplace_back(modname, _count); - } - // report the instrumented - for(auto& itr : _pass_info) - { - auto _valid = (verbose_level > _verbose_lvl || - (itr.second.first + itr.second.second) > 0); - if(_valid) - { - verbprintf(_verbose_lvl, "%4zu instrumented procedures in %s\n", - 
itr.second.first, itr.first.c_str()); - _valid = (loop_level_instr && - (verbose_level > _verbose_lvl || itr.second.second > 0)); - if(_valid) - { - verbprintf(_verbose_lvl, "%4zu instrumented loop procedures in %s\n", - itr.second.second, itr.first.c_str()); - } + if(itr.function != main_func && itr.function != _mutatee_init && + itr.function != _mutatee_fini) + _insert_module_function(instrumented_module_functions, itr); } + else + _insert_module_function(excluded_module_functions, itr); + if(itr.is_overlapping()) + _insert_module_function(overlapping_module_functions, itr); } } - //----------------------------------------------------------------------------------// - // - // Add the snippet that assign the hash ids - // - //----------------------------------------------------------------------------------// - - omnitrace_snippet_vec hash_snippet_vec{}; - // generate a call expression for each hash + key - for(auto& itr : hash_ids) - hash_snippet_vec.generate(hash_func, itr.first, itr.second.c_str()); - // append all the call expressions to init names - hash_snippet_vec.append(init_names); - //----------------------------------------------------------------------------------// // // Insert the initialization and finalization routines into the main entry and @@ -1892,12 +1511,57 @@ main(int argc, char** argv) addr_space->beginInsertionSet(); } - verbprintf(2, "Beginning loop over modules [instrumentation pass]\n"); + verbprintf(2, "Beginning instrumentation loop...\n"); verbprintf(1, "\n"); - for(auto& instr_procedure : instr_procedure_functions) - instr_procedure(); + std::map> _pass_info{}; + const int _pass_verbose_lvl = 2; + for(const auto& itr : instrumented_module_functions) + { + auto _count = itr(addr_space, entr_trace, exit_trace); + _pass_info[itr.module_name].first += _count.first; + _pass_info[itr.module_name].second += _count.second; + + auto _report = [](int _lvl, const string_t& _action, const string_t& _type, + const string_t& _reason, const 
string_t& _name, + const std::string& _extra = {}) { + static std::map already_reported{}; + auto _key = _type + _action + _reason; + if(already_reported[_key].count(_name) == 0) + { + verbprintf(_lvl, "[%s][%s] %s :: '%s'", _type.c_str(), _action.c_str(), + _reason.c_str(), _name.c_str()); + if(!_extra.empty()) verbprintf_bare(_lvl, " (%s)", _extra.c_str()); + verbprintf_bare(_lvl, "...\n"); + already_reported[_key].insert(_name); + } + }; + + for(const auto& mitr : itr.messages) + _report(std::get<0>(mitr), std::get<1>(mitr), std::get<2>(mitr), + std::get<3>(mitr), + std::get<2>(mitr) == "module" ? itr.module_name : itr.function_name); + } verbprintf(1, "\n"); + // report the instrumented + for(auto& itr : _pass_info) + { + auto _valid = (verbose_level > _pass_verbose_lvl || + (itr.second.first + itr.second.second) > 0); + if(_valid) + { + verbprintf(_pass_verbose_lvl, "%4zu instrumented procedures in %s\n", + itr.second.first, itr.first.c_str()); + _valid = (loop_level_instr && + (verbose_level > _pass_verbose_lvl || itr.second.second > 0)); + if(_valid) + { + verbprintf(_pass_verbose_lvl, "%4zu instrumented loop procedures in %s\n", + itr.second.second, itr.first.c_str()); + } + } + } + if(app_thread) { verbprintf(1, "Finalizing insertion set...\n"); @@ -1908,40 +1572,41 @@ main(int argc, char** argv) verbprintf( 1, "Using insertion set failed. 
Restarting with individual insertion...\n"); - auto _execute_batch = [&instr_procedure_functions, &addr_space](size_t _beg, - size_t _end) { + auto _execute_batch = [&addr_space, &entr_trace, &exit_trace](size_t _beg, + size_t _end) { verbprintf(1, "Instrumenting batch of functions [%lu, %lu)\n", (unsigned long) _beg, (unsigned long) _end); addr_space->beginInsertionSet(); - for(size_t i = _beg; i < _end; ++i) - { - if(i < instr_procedure_functions.size()) - instr_procedure_functions.at(i)(); - } + auto itr = instrumented_module_functions.begin(); + std::advance(itr, _beg); + for(size_t i = _beg; i < _end; ++i, ++itr) + (*itr)(addr_space, entr_trace, exit_trace); bool _modified = true; bool _success = addr_space->finalizeInsertionSet(true, &_modified); return _success; }; - auto execute_batch = [&_execute_batch, - &instr_procedure_functions](size_t _beg) { + auto execute_batch = [&_execute_batch, &addr_space, &entr_trace, + &exit_trace](size_t _beg) { if(!_execute_batch(_beg, _beg + batch_size)) { verbprintf(1, "Batch instrumentation of functions [%lu, %lu) failed. 
" "Beginning non-batched instrumentation for this set\n", (unsigned long) _beg, (unsigned long) _beg + batch_size); - for(size_t i = _beg; i < _beg + batch_size; ++i) + auto itr = instrumented_module_functions.begin(); + auto _end = instrumented_module_functions.end(); + std::advance(itr, _beg); + for(size_t i = _beg; i < _beg + batch_size && itr != _end; ++i, ++itr) { - if(i < instr_procedure_functions.size()) - instr_procedure_functions.at(i)(); + (*itr)(addr_space, entr_trace, exit_trace); } } return _beg + batch_size; }; size_t nidx = 0; - while(nidx < instr_procedure_functions.size()) + while(nidx < instrumented_module_functions.size()) { nidx = execute_batch(nidx); } @@ -1994,19 +1659,19 @@ main(int argc, char** argv) if(_mode == "modules") { for(const auto& itr : _modset) - _insert(itr.module, TIMEMORY_JOIN("", "[", itr.module, "]")); + _insert(itr.module_name, TIMEMORY_JOIN("", "[", itr.module_name, "]")); } else if(_mode == "functions") { for(const auto& itr : _modset) - _insert(itr.module, TIMEMORY_JOIN("", "[", itr.function, "][", - itr.address_range, "]")); + _insert(itr.module_name, TIMEMORY_JOIN("", "[", itr.function_name, "][", + itr.address_range, "]")); } else if(_mode == "functions+") { for(const auto& itr : _modset) - _insert(itr.module, TIMEMORY_JOIN("", "[", itr.signature.get(), "][", - itr.address_range, "]")); + _insert(itr.module_name, TIMEMORY_JOIN("", "[", itr.signature.get(), "][", + itr.address_range, "]")); } else if(_mode == "pair") { @@ -2014,9 +1679,9 @@ main(int argc, char** argv) { std::stringstream _ss{}; _ss << std::boolalpha; - _ss << "" << itr.module << "] --> [" << itr.function << "][" + _ss << "" << itr.module_name << "] --> [" << itr.function_name << "][" << itr.address_range << "]"; - _insert(itr.module, _ss.str()); + _insert(itr.module_name, _ss.str()); } } else if(_mode == "pair+") @@ -2025,9 +1690,9 @@ main(int argc, char** argv) { std::stringstream _ss{}; _ss << std::boolalpha; - _ss << "[" << itr.module << "] --> [" << 
itr.signature.get() << "][" + _ss << "[" << itr.module_name << "] --> [" << itr.signature.get() << "][" << itr.address_range << "]"; - _insert(itr.module, _ss.str()); + _insert(itr.module_name, _ss.str()); } } else @@ -2117,6 +1782,17 @@ main(int argc, char** argv) { verbprintf(0, "Executing...\n"); +#define WAITPID_DEBUG_MESSAGE(QUERY) \ + { \ + QUERY; \ + verbprintf(3, \ + "waitpid (%i, %i) returned [%s:%i] :: %s :: code: %i, status %i. " \ + "WIFEXITED(status) = " \ + "%i, WIFSIGNALED(status) = %i\n", \ + cpid, w, __FILE__, __LINE__, #QUERY, code, status, WIFEXITED(status), \ + WIFSIGNALED(status)); \ + } + if(!app_thread->isTerminated()) { pid_t cpid = app_thread->getPid(); @@ -2124,6 +1800,7 @@ main(int argc, char** argv) app_thread->detach(true); do { + status = 0; pid_t w = waitpid(cpid, &status, WUNTRACED); if(w == -1) { @@ -2133,21 +1810,21 @@ main(int argc, char** argv) if(WIFEXITED(status)) { - code = WEXITSTATUS(status); + WAITPID_DEBUG_MESSAGE(code = WEXITSTATUS(status)); } else if(WIFSIGNALED(status)) { - code = WTERMSIG(status); + WAITPID_DEBUG_MESSAGE(code = WTERMSIG(status)); } else if(WIFSTOPPED(status)) { - code = WSTOPSIG(status); + WAITPID_DEBUG_MESSAGE(code = WSTOPSIG(status)); } else if(WIFCONTINUED(status)) { - code = WIFCONTINUED(status); + WAITPID_DEBUG_MESSAGE(code = WIFCONTINUED(status)); } - } while(!WIFEXITED(status) && !WIFSIGNALED(status)); + } while(WIFEXITED(status) == 0 && WIFSIGNALED(status) == 0); } else { @@ -2258,7 +1935,6 @@ instrument_module(const string_t& file_name) //======================================================================================// -extern const strset_t exclude_function_names; bool instrument_entity(const string_t& function_name) { @@ -2284,7 +1960,8 @@ instrument_entity(const string_t& function_name) "virtual thunk|non-virtual thunk|transaction clone|" "RtsLayer|DYNINST|PthreadLayer|threaded_func|PMPI|" "Kokkos::Impl::|Kokkos::Experimental::Impl::|Kokkos::impl_|" - 
"Kokkos::[A-Za-z]+::impl_|Kokkos::Tools::|Kokkos::Profiling::)", + "Kokkos::[A-Za-z]+::impl_|Kokkos::Tools::|Kokkos::Profiling::|" + "kmp_threadprivate_)", regex_opts); static std::regex trailing("(\\.part\\.[0-9]+|\\.constprop\\.[0-9]+|\\.|\\.[0-9]+)$", regex_opts); @@ -2301,31 +1978,31 @@ instrument_entity(const string_t& function_name) if(std::regex_search(function_name, exclude) || std::regex_search(function_name, exclude_cxx)) { - _report("critical", function_name, 3); + _report("Excluding", "critical", 3); return false; } if(whole.count(function_name) > 0) { - _report("critical", function_name, 3); + _report("Excluding", "critical", 3); return false; } // don't instrument the functions when key is found at the start of the function name if(std::regex_search(function_name, leading)) { - _report("recommended", function_name, 3); + _report("Excluding", "recommended", 3); return false; } // don't instrument the functions when key is found at the end of the function name if(std::regex_search(function_name, trailing)) { - _report("recommended", function_name, 3); + _report("Excluding", "recommended", 3); return false; } - _report("Including function [no constraint] : '%s'...\n", function_name, 3); + _report("Including", "no constraint", 2); return true; } @@ -2369,20 +2046,14 @@ query_instr(procedure_t* funcToInstr, procedure_loc_t traceLoc, flow_graph_t* cf return (_n > 0); } -//======================================================================================// -// insert_instr -- generic insert instrumentation function -// -template -bool -insert_instr(address_space_t* mutatee, procedure_t* funcToInstr, Tp traceFunc, - procedure_loc_t traceLoc, flow_graph_t* cfGraph, - basic_loop_t* loopToInstrument, bool allow_traps) +std::tuple +query_instr(procedure_t* funcToInstr, procedure_loc_t traceLoc, flow_graph_t* cfGraph, + basic_loop_t* loopToInstrument) { module_t* module = funcToInstr->getModule(); - if(!module || !traceFunc) return false; + if(!module) return 
{ 0, 0 }; bpvector_t* _points = nullptr; - auto _trace = traceFunc.get(); if(cfGraph && loopToInstrument) { @@ -2396,65 +2067,31 @@ insert_instr(address_space_t* mutatee, procedure_t* funcToInstr, Tp traceFunc, _points = funcToInstr->findPoint(traceLoc); } - if(_points == nullptr) return false; - if(_points->empty()) return false; - - /*if(loop_level_instr) - { - flow_graph_t* flow = funcToInstr->getCFG(); - bpvector_t basicLoop; - flow->getOuterLoops(basicLoop); - for(auto litr = basicLoop.begin(); litr != basicLoop.end(); ++litr) - { - bpvector_t* _tmp; - if(traceLoc == BPatch_entry) - _tmp = cfGraph->findLoopInstPoints(BPatch_locLoopEntry, *litr); - else if(traceLoc == BPatch_exit) - _tmp = cfGraph->findLoopInstPoints(BPatch_locLoopExit, *litr); - if(!_tmp) - continue; - for(auto& itr : *_tmp) - _points->push_back(itr); - } - }*/ - - // verbprintf(0, "Instrumenting |> [ %s ]\n", name.m_name.c_str()); - - std::set _traps{}; - if(!allow_traps) - { - for(auto& itr : *_points) - { - if(itr && itr->usesTrap_NP()) _traps.insert(itr); - } - } + if(_points == nullptr) return { 0, 0 }; + if(_points->empty()) return { 0, 0 }; size_t _n = 0; + size_t _t = 0; for(auto& itr : *_points) { - if(!itr || _traps.count(itr) > 0) - continue; - else if(traceLoc == BPatch_entry) - mutatee->insertSnippet(*_trace, *itr, BPatch_callBefore, BPatch_firstSnippet); - // else if(traceLoc == BPatch_exit) - // mutatee->insertSnippet(*_trace, *itr, BPatch_callAfter, - // BPatch_firstSnippet); - else - mutatee->insertSnippet(*_trace, *itr); - ++_n; + if(itr) + { + ++_n; + if(itr->usesTrap_NP()) ++_t; + } } - return (_n > 0); + return { _n, _t }; } //======================================================================================// // Constraints for instrumentation. Returns true for those modules that // shouldn't be instrumented. 
bool -module_constraint(char* fname) +module_constraint(string_view_t fname) { // fname is the name of module/file - string_t _fname = fname; + string_t _fname = string_t{ fname }; // never instrumentat any module matching omnitrace if(_fname.find("omnitrace") != string_t::npos) return true; @@ -2472,9 +2109,9 @@ module_constraint(char* fname) // Constraint for routines. The constraint returns true for those routines that // should not be instrumented. bool -routine_constraint(const char* fname) +routine_constraint(string_view_t fname) { - string_t _fname = fname; + string_t _fname = string_t{ fname }; if(_fname.find("omnitrace") != string_t::npos) return true; auto npos = std::string::npos; diff --git a/source/bin/omnitrace/omnitrace.hpp b/source/bin/omnitrace/omnitrace.hpp index e799ee0ad9..a5fc4a6959 100644 --- a/source/bin/omnitrace/omnitrace.hpp +++ b/source/bin/omnitrace/omnitrace.hpp @@ -22,231 +22,10 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MUTNAMELEN 1024 -#define FUNCNAMELEN 32 * 1024 -#define NO_ERROR -1 -#define TIMEMORY_BIN_DIR "bin" - -#if !defined(PATH_MAX) -# define PATH_MAX std::numeric_limits::max(); -#endif - -struct function_signature; -struct module_function; - -template -using bpvector_t = BPatch_Vector; - -using string_t = std::string; -using stringstream_t = std::stringstream; -using strvec_t = std::vector; -using strset_t = std::set; -using regexvec_t = std::vector; -using fmodset_t = std::set; -using fixed_modset_t = std::map; -using exec_callback_t = BPatchExecCallback; -using exit_callback_t = BPatchExitCallback; -using fork_callback_t = BPatchForkCallback; -using patch_t = 
BPatch; -using process_t = BPatch_process; -using thread_t = BPatch_thread; -using binary_edit_t = BPatch_binaryEdit; -using image_t = BPatch_image; -using module_t = BPatch_module; -using procedure_t = BPatch_function; -using snippet_t = BPatch_snippet; -using call_expr_t = BPatch_funcCallExpr; -using address_space_t = BPatch_addressSpace; -using flow_graph_t = BPatch_flowGraph; -using basic_loop_t = BPatch_basicBlockLoop; -using procedure_loc_t = BPatch_procedureLocation; -using point_t = BPatch_point; -using local_var_t = BPatch_localVar; -using const_expr_t = BPatch_constExpr; -using error_level_t = BPatchErrorLevel; -using patch_pointer_t = std::shared_ptr; -using snippet_pointer_t = std::shared_ptr; -using call_expr_pointer_t = std::shared_ptr; -using snippet_vec_t = bpvector_t; -using procedure_vec_t = bpvector_t; -using basic_loop_vec_t = bpvector_t; -using snippet_pointer_vec_t = std::vector; - -void -omnitrace_prefork_callback(thread_t* parent, thread_t* child); - -//======================================================================================// -// -// Global Variables -// -//======================================================================================// -// -// boolean settings -// -static bool use_return_info = false; -static bool use_args_info = false; -static bool use_file_info = false; -static bool use_line_info = false; -// -// integral settings -// -extern bool debug_print; -extern int verbose_level; -// -// string settings -// -static string_t main_fname = "main"; -static string_t argv0 = {}; -static string_t cmdv0 = {}; -static string_t default_components = "wall_clock"; -static string_t prefer_library = {}; -// -// global variables -// -static patch_pointer_t bpatch = {}; -static call_expr_t* terminate_expr = nullptr; -static snippet_vec_t init_names = {}; -static snippet_vec_t fini_names = {}; -static fmodset_t available_module_functions = {}; -static fmodset_t instrumented_module_functions = {}; -static fmodset_t 
overlapping_module_functions = {}; -static fmodset_t excluded_module_functions = {}; -static fixed_modset_t fixed_module_functions = {}; -static regexvec_t func_include = {}; -static regexvec_t func_exclude = {}; -static regexvec_t file_include = {}; -static regexvec_t file_exclude = {}; -static regexvec_t file_restrict = {}; -static regexvec_t func_restrict = {}; -// -//======================================================================================// - -// control debug printf statements -#define errprintf(LEVEL, ...) \ - { \ - if(werror || LEVEL < 0) \ - { \ - if(debug_print || verbose_level >= LEVEL) \ - fprintf(stderr, "[omnitrace][exe] Error! " __VA_ARGS__); \ - char _buff[FUNCNAMELEN]; \ - sprintf(_buff, "[omnitrace][exe] Error! " __VA_ARGS__); \ - throw std::runtime_error(std::string{ _buff }); \ - } \ - else \ - { \ - if(debug_print || verbose_level >= LEVEL) \ - fprintf(stderr, "[omnitrace][exe] Warning! " __VA_ARGS__); \ - } \ - fflush(stderr); \ - } - -// control verbose printf statements -#define verbprintf(LEVEL, ...) \ - { \ - if(debug_print || verbose_level >= LEVEL) \ - fprintf(stdout, "[omnitrace][exe] " __VA_ARGS__); \ - fflush(stdout); \ - } - -#define verbprintf_bare(LEVEL, ...) \ - { \ - if(debug_print || verbose_level >= LEVEL) fprintf(stdout, __VA_ARGS__); \ - fflush(stdout); \ - } - -//======================================================================================// - -template -void -consume_parameters(T&&...) 
-{} - -//======================================================================================// - -extern "C" -{ - bool are_file_include_exclude_lists_empty(); - bool instrument_module(const string_t& file_name); - bool instrument_entity(const string_t& function_name); - bool module_constraint(char* fname); - bool routine_constraint(const char* fname); -} - -//======================================================================================// - -strset_t -get_whole_function_names(); - -function_signature -get_func_file_line_info(module_t* mutatee_module, procedure_t* f); - -function_signature -get_loop_file_line_info(module_t* mutatee_module, procedure_t* f, flow_graph_t* cfGraph, - basic_loop_t* loopToInstrument); - -bool -query_instr(procedure_t* funcToInstr, procedure_loc_t traceLoc, - flow_graph_t* cfGraph = nullptr, basic_loop_t* loopToInstrument = nullptr, - bool allow_traps = true); - -template -bool -insert_instr(address_space_t* mutatee, procedure_t* funcToInstr, Tp traceFunc, - procedure_loc_t traceLoc, flow_graph_t* cfGraph = nullptr, - basic_loop_t* loopToInstrument = nullptr, bool allow_traps = true); - -void -errorFunc(error_level_t level, int num, const char** params); - -procedure_t* -find_function(image_t* appImage, const string_t& functionName, const strset_t& = {}); - -void -error_func_real(error_level_t level, int num, const char* const* params); - -void -error_func_fake(error_level_t level, int num, const char* const* params); +#include "function_signature.hpp" +#include "fwd.hpp" +#include "info.hpp" +#include "module_function.hpp" //======================================================================================// @@ -285,465 +64,6 @@ to_lower(string_t s) // //======================================================================================// // -struct function_signature -{ - using location_t = std::pair; - - bool m_loop = false; - bool m_info_beg = false; - bool m_info_end = false; - location_t m_row = { 0, 0 }; - 
location_t m_col = { 0, 0 }; - string_t m_return = {}; - string_t m_name = {}; - string_t m_params = "()"; - string_t m_file = {}; - mutable string_t m_signature = {}; - - TIMEMORY_DEFAULT_OBJECT(function_signature) - - template - void serialize(ArchiveT& _ar, const unsigned) - { - namespace cereal = tim::cereal; - (void) get(); - _ar(cereal::make_nvp("loop", m_loop), cereal::make_nvp("info_beg", m_info_beg), - cereal::make_nvp("info_end", m_info_end), cereal::make_nvp("row", m_row), - cereal::make_nvp("col", m_col), cereal::make_nvp("return", m_return), - cereal::make_nvp("name", m_name), cereal::make_nvp("params", m_params), - cereal::make_nvp("file", m_file), cereal::make_nvp("signature", m_signature)); - (void) get(); - } - - function_signature(string_t _ret, const string_t& _name, string_t _file, - location_t _row = { 0, 0 }, location_t _col = { 0, 0 }, - bool _loop = false, bool _info_beg = false, bool _info_end = false) - : m_loop(_loop) - , m_info_beg(_info_beg) - , m_info_end(_info_end) - , m_row(std::move(_row)) - , m_col(std::move(_col)) - , m_return(std::move(_ret)) - , m_name(tim::demangle(_name)) - , m_file(std::move(_file)) - { - if(m_file.find('/') != string_t::npos) - m_file = m_file.substr(m_file.find_last_of('/') + 1); - } - - function_signature(const string_t& _ret, const string_t& _name, const string_t& _file, - const std::vector& _params, location_t _row = { 0, 0 }, - location_t _col = { 0, 0 }, bool _loop = false, - bool _info_beg = false, bool _info_end = false) - : function_signature(_ret, _name, _file, _row, _col, _loop, _info_beg, _info_end) - { - m_params = "("; - for(const auto& itr : _params) - m_params.append(itr + ", "); - if(!_params.empty()) m_params = m_params.substr(0, m_params.length() - 2); - m_params += ")"; - } - - friend bool operator==(const function_signature& lhs, const function_signature& rhs) - { - return lhs.get() == rhs.get(); - } - - static auto get(function_signature& sig) { return sig.get(); } - - string_t get() 
const - { - std::stringstream ss; - if(use_return_info && !m_return.empty()) ss << m_return << " "; - ss << m_name; - if(use_args_info) ss << m_params; - if(m_loop && m_info_beg) - { - if(m_info_end) - { - ss << " [{" << m_row.first << "," << m_col.first << "}-{" << m_row.second - << "," << m_col.second << "}]"; - } - else - { - ss << "[{" << m_row.first << "," << m_col.first << "}]"; - } - } - if(use_file_info && m_file.length() > 0) ss << " [" << m_file; - if(use_line_info && m_row.first > 0) ss << ":" << m_row.first; - if(use_file_info && m_file.length() > 0) ss << "]"; - - m_signature = ss.str(); - return m_signature; - } -}; -// -//======================================================================================// -// -struct module_function -{ - using width_t = std::array; - using address_t = Dyninst::Address; - - static constexpr size_t absolute_max_width = 80; - - static auto& get_width() - { - static width_t _instance = []() { - width_t _tmp; - _tmp.fill(0); - return _tmp; - }(); - return _instance; - } - - TIMEMORY_DEFAULT_OBJECT(module_function) - - static void reset_width() { get_width().fill(0); } - - static void update_width(const module_function& rhs) - { - get_width()[0] = std::max(get_width()[0], rhs.module.length()); - get_width()[1] = std::max(get_width()[1], rhs.function.length()); - get_width()[2] = std::max(get_width()[2], rhs.signature.get().length()); - } - - module_function(string_t _module, string_t _func, function_signature _sign, - procedure_t* proc) - : module(std::move(_module)) - , function(std::move(_func)) - , signature(std::move(_sign)) - { - if(proc) - { - std::pair _range{}; - if(proc->getAddressRange(_range.first, _range.second)) - address_range = _range.second - _range.first; - } - } - - module_function(module_t* mod, procedure_t* proc) - { - char modname[FUNCNAMELEN]; - char fname[FUNCNAMELEN]; - - mod->getFullName(modname, FUNCNAMELEN); - proc->getName(fname, FUNCNAMELEN); - - module = modname; - function = fname; - 
signature = get_func_file_line_info(mod, proc); - if(!proc->isInstrumentable()) - { - verbprintf(0, - "Warning! module function generated for un-instrumentable " - "function: %s [%s]\n", - function.c_str(), module.c_str()); - } - std::pair _range{}; - if(proc->getAddressRange(_range.first, _range.second)) - address_range = _range.second - _range.first; - } - - friend bool operator<(const module_function& lhs, const module_function& rhs) - { - return (lhs.module == rhs.module) - ? ((lhs.function == rhs.function) - ? (lhs.signature.get() < rhs.signature.get()) - : (lhs.function < rhs.function)) - : (lhs.module < rhs.module); - } - - friend bool operator==(const module_function& lhs, const module_function& rhs) - { - return std::tie(lhs.module, lhs.function, lhs.signature, lhs.address_range) == - std::tie(rhs.module, rhs.function, rhs.signature, rhs.address_range); - } - - static void write_header(std::ostream& os) - { - auto w0 = std::min(get_width()[0], absolute_max_width); - auto w1 = std::min(get_width()[1], absolute_max_width); - auto w2 = std::min(get_width()[2], absolute_max_width); - - std::stringstream ss; - ss << std::setw(14) << "AddressRange" - << " " << std::setw(w0 + 8) << std::left << "Module" - << " " << std::setw(w1 + 8) << std::left << "Function" - << " " << std::setw(w2 + 8) << std::left << "FunctionSignature" - << "\n"; - os << ss.str(); - } - - friend std::ostream& operator<<(std::ostream& os, const module_function& rhs) - { - std::stringstream ss; - - auto w0 = std::min(get_width()[0], absolute_max_width); - auto w1 = std::min(get_width()[1], absolute_max_width); - auto w2 = std::min(get_width()[2], absolute_max_width); - - auto _get_str = [](const std::string& _inc) { - if(_inc.length() > absolute_max_width) - return _inc.substr(0, absolute_max_width - 3) + "..."; - return _inc; - }; - - // clang-format off - ss << std::setw(14) << rhs.address_range << " " - << std::setw(w0 + 8) << std::left << _get_str(rhs.module) << " " - << std::setw(w1 + 8) 
<< std::left << _get_str(rhs.function) << " " - << std::setw(w2 + 8) << std::left << _get_str(rhs.signature.get()); - // clang-format on - - os << ss.str(); - return os; - } - - size_t address_range = 0; - string_t module = {}; - string_t function = {}; - function_signature signature = {}; - - template - void serialize(ArchiveT& _ar, const unsigned) - { - namespace cereal = tim::cereal; - _ar(cereal::make_nvp("address_range", address_range), - cereal::make_nvp("module", module), cereal::make_nvp("function", function), - cereal::make_nvp("signature", signature)); - } -}; -// -//======================================================================================// -// -static inline void -dump_info(std::ostream& _os, const fmodset_t& _data) -{ - module_function::reset_width(); - for(const auto& itr : _data) - module_function::update_width(itr); - - module_function::write_header(_os); - for(const auto& itr : _data) - _os << itr << '\n'; - - module_function::reset_width(); -} -// -template ::value, int> = 0> -static inline void -dump_info(ArchiveT& _ar, const fmodset_t& _data) -{ - _ar(tim::cereal::make_nvp("module_functions", _data)); -} -// -static inline void -dump_info(const string_t& _label, string_t _oname, const string_t& _ext, - const fmodset_t& _data, int _level, bool _fail) -{ - namespace cereal = tim::cereal; - namespace policy = tim::policy; - - _oname += "." + _ext; - auto _handle_error = [&]() { - std::stringstream _msg{}; - _msg << "[dump_info] Error opening '" << _oname << " for output"; - verbprintf(_level, "%s\n", _msg.str().c_str()); - if(_fail) - throw std::runtime_error(std::string{ "[omnitrace][exe]" } + _msg.str()); - }; - - if(!debug_print && verbose_level < _level) return; - - if(_ext == "txt") - { - std::ofstream ofs{}; - if(!tim::filepath::open(ofs, _oname)) - _handle_error(); - else - { - verbprintf(_level, "Outputting '%s'... 
", _oname.c_str()); - dump_info(ofs, _data); - verbprintf_bare(_level, "Done\n"); - } - ofs.close(); - } - else if(_ext == "xml") - { - std::stringstream oss{}; - { - using output_policy = policy::output_archive; - output_policy::indent() = true; - auto ar = output_policy::get(oss); - - ar->setNextName("omnitrace"); - ar->startNode(); - ar->setNextName(_label.c_str()); - ar->startNode(); - (*ar)(cereal::make_nvp("module_functions", _data)); - ar->finishNode(); - ar->finishNode(); - } - - std::ofstream ofs{}; - if(!tim::filepath::open(ofs, _oname)) - _handle_error(); - else - { - verbprintf(_level, "Outputting '%s'... ", _oname.c_str()); - ofs << oss.str() << std::endl; - verbprintf_bare(_level, "Done\n"); - } - ofs.close(); - } - else if(_ext == "json") - { - std::stringstream oss{}; - { - using output_policy = policy::output_archive; - auto ar = output_policy::get(oss); - - ar->setNextName("omnitrace"); - ar->startNode(); - ar->setNextName(_label.c_str()); - ar->startNode(); - (*ar)(cereal::make_nvp("module_functions", _data)); - ar->finishNode(); - ar->finishNode(); - } - - std::ofstream ofs{}; - if(!tim::filepath::open(ofs, _oname)) - _handle_error(); - else - { - verbprintf(_level, "Outputting '%s'... 
", _oname.c_str()); - ofs << oss.str() << std::endl; - verbprintf_bare(_level, "Done\n"); - } - ofs.close(); - } - else - { - throw std::runtime_error(TIMEMORY_JOIN( - "", "[omnitrace][exe] Error in ", __FUNCTION__, " :: filename '", _oname, - "' does not have one of recognized file extensions: txt, json, xml")); - } -} -// -static inline void -dump_info(const string_t& _oname, const fmodset_t& _data, int _level, bool _fail, - const string_t& _type, const strset_t& _ext) -{ - for(const auto& itr : _ext) - dump_info(_type, _oname, itr, _data, _level, _fail); -} -// -static inline void -load_info(const string_t& _label, const string_t& _iname, fmodset_t& _data, int _level) -{ - namespace cereal = tim::cereal; - namespace policy = tim::policy; - - auto _pos = _iname.find_last_of('.'); - std::string _ext = {}; - if(_pos != std::string::npos) _ext = _iname.substr(_pos + 1, _iname.length()); - - auto _handle_error = [&]() { - std::stringstream _msg{}; - _msg << "[load_info] Error opening '" << _iname << " for input"; - verbprintf(_level, "%s\n", _msg.str().c_str()); - throw std::runtime_error(std::string{ "[omnitrace][exe]" } + _msg.str()); - }; - - if(_ext == "xml") - { - verbprintf(_level, "Reading '%s'... ", _iname.c_str()); - std::ifstream ifs{ _iname }; - if(!ifs) - _handle_error(); - else - { - using input_policy = policy::input_archive; - auto ar = input_policy::get(ifs); - - ar->setNextName("omnitrace"); - ar->startNode(); - ar->setNextName(_label.c_str()); - ar->startNode(); - (*ar)(cereal::make_nvp("module_functions", _data)); - ar->finishNode(); - ar->finishNode(); - } - verbprintf_bare(_level, "Done\n"); - ifs.close(); - } - else if(_ext == "json") - { - verbprintf(_level, "Reading '%s'... 
", _iname.c_str()); - std::ifstream ifs{ _iname }; - if(!ifs) - _handle_error(); - else - { - using input_policy = policy::input_archive; - auto ar = input_policy::get(ifs); - - ar->setNextName("omnitrace"); - ar->startNode(); - ar->setNextName(_label.c_str()); - ar->startNode(); - (*ar)(cereal::make_nvp("module_functions", _data)); - ar->finishNode(); - ar->finishNode(); - } - verbprintf_bare(_level, "Done\n"); - ifs.close(); - } - else - { - throw std::runtime_error(TIMEMORY_JOIN( - "", "[omnitrace][exe] Error in ", __FUNCTION__, " :: filename '", _iname, - "' does not have one of recognized extentions: txt, json, xml :: ", _ext)); - } -} -// -static inline void -load_info(const string_t& _inp, std::map& _data, int _level) -{ - std::vector _exceptions{}; - _exceptions.reserve(_data.size()); - for(auto& itr : _data) - { - try - { - fmodset_t _tmp{}; - load_info(itr.first, _inp, _tmp, _level); - // add to the existing - itr.second->insert(_tmp.begin(), _tmp.end()); - // if it did not throw it was successfully loaded - _exceptions.clear(); - break; - } catch(std::exception& _e) - { - _exceptions.emplace_back(_e.what()); - } - } - if(!_exceptions.empty()) - { - std::stringstream _msg{}; - for(auto& itr : _exceptions) - { - _msg << "[omnitrace][exe] " << itr << "\n"; - } - throw std::runtime_error(_msg.str()); - } -} -// -//======================================================================================// -// template ::value, int> = 0> snippet_pointer_t get_snippet(Tp arg) @@ -962,4 +282,79 @@ omnitrace_fork_callback(thread_t* parent, thread_t* child) } // //======================================================================================// +// insert_instr -- generic insert instrumentation function // +template +bool +insert_instr(address_space_t* mutatee, procedure_t* funcToInstr, Tp traceFunc, + procedure_loc_t traceLoc, flow_graph_t* cfGraph, + basic_loop_t* loopToInstrument, bool allow_traps) +{ + module_t* module = funcToInstr->getModule(); + 
if(!module || !traceFunc) return false; + + bpvector_t* _points = nullptr; + auto _trace = traceFunc.get(); + + if(cfGraph && loopToInstrument) + { + if(traceLoc == BPatch_entry) + _points = cfGraph->findLoopInstPoints(BPatch_locLoopEntry, loopToInstrument); + else if(traceLoc == BPatch_exit) + _points = cfGraph->findLoopInstPoints(BPatch_locLoopExit, loopToInstrument); + } + else + { + _points = funcToInstr->findPoint(traceLoc); + } + + if(_points == nullptr) return false; + if(_points->empty()) return false; + + /*if(loop_level_instr) + { + flow_graph_t* flow = funcToInstr->getCFG(); + bpvector_t basicLoop; + flow->getOuterLoops(basicLoop); + for(auto litr = basicLoop.begin(); litr != basicLoop.end(); ++litr) + { + bpvector_t* _tmp; + if(traceLoc == BPatch_entry) + _tmp = cfGraph->findLoopInstPoints(BPatch_locLoopEntry, *litr); + else if(traceLoc == BPatch_exit) + _tmp = cfGraph->findLoopInstPoints(BPatch_locLoopExit, *litr); + if(!_tmp) + continue; + for(auto& itr : *_tmp) + _points->push_back(itr); + } + }*/ + + // verbprintf(0, "Instrumenting |> [ %s ]\n", name.m_name.c_str()); + + std::set _traps{}; + if(!allow_traps) + { + for(auto& itr : *_points) + { + if(itr && itr->usesTrap_NP()) _traps.insert(itr); + } + } + + size_t _n = 0; + for(auto& itr : *_points) + { + if(!itr || _traps.count(itr) > 0) + continue; + else if(traceLoc == BPatch_entry) + mutatee->insertSnippet(*_trace, *itr, BPatch_callBefore, BPatch_firstSnippet); + // else if(traceLoc == BPatch_exit) + // mutatee->insertSnippet(*_trace, *itr, BPatch_callAfter, + // BPatch_firstSnippet); + else + mutatee->insertSnippet(*_trace, *itr); + ++_n; + } + + return (_n > 0); +} diff --git a/source/bin/tests/CMakeLists.txt b/source/bin/tests/CMakeLists.txt new file mode 100644 index 0000000000..a201897bc2 --- /dev/null +++ b/source/bin/tests/CMakeLists.txt @@ -0,0 +1,149 @@ +# adds a ctest for executable +function(OMNITRACE_ADD_BIN_TEST) + cmake_parse_arguments( + TEST + "" # options + 
"NAME;TARGET;TIMEOUT;WORKING_DIRECTORY" # single value args + "ARGS;ENVIRONMENT;LABELS;PROPERTIES;PASS_REGULAR_EXPRESSION;FAIL_REGULAR_EXPRESSION;SKIP_REGULAR_EXPRESSION;DEPENDS;COMMAND" # multiple + # value args + ${ARGN}) + + if(NOT OMNITRACE_DYNINST_API_RT_DIR AND OMNITRACE_DYNINST_API_RT) + get_filename_component(OMNITRACE_DYNINST_API_RT_DIR "${OMNITRACE_DYNINST_API_RT}" + DIRECTORY) + endif() + + if(OMNITRACE_BUILD_DYNINST) + set(OMNITRACE_DYNINST_API_RT_DIR + "${PROJECT_BINARY_DIR}/external/dyninst/dyninstAPI_RT:${PROJECT_BINARY_DIR}/external/dyninst/dyninstAPI" + ) + endif() + + if(NOT TEST_ENVIRONMENT) + set(TEST_ENVIRONMENT + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}" + ) + endif() + + list(APPEND TEST_ENVIRONMENT "OMNITRACE_CI=ON" + "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" + "OMNITRACE_OUTPUT_PREFIX=${TEST_NAME}/") + + if(TEST_COMMAND) + add_test( + NAME ${TEST_NAME} + COMMAND ${TEST_COMMAND} ${TEST_ARGS} + WORKING_DIRECTORY ${TEST_WORKING_DIRECTORY}) + + set_tests_properties( + ${TEST_NAME} + PROPERTIES ENVIRONMENT + "${TEST_ENVIRONMENT}" + TIMEOUT + ${TEST_TIMEOUT} + LABELS + "omnitrace-bin;${TEST_LABELS}" + PASS_REGULAR_EXPRESSION + "${TEST_PASS_REGULAR_EXPRESSION}" + FAIL_REGULAR_EXPRESSION + "${TEST_FAIL_REGULAR_EXPRESSION}" + SKIP_REGULAR_EXPRESSION + "${TEST_SKIP_REGULAR_EXPRESSION}" + ${TEST_PROPERTIES}) + elseif(TARGET ${TEST_TARGET}) + add_test( + NAME ${TEST_NAME} + COMMAND $ ${TEST_ARGS} + WORKING_DIRECTORY $) + + set_tests_properties( + ${TEST_NAME} + PROPERTIES ENVIRONMENT + "${TEST_ENVIRONMENT}" + TIMEOUT + ${TEST_TIMEOUT} + LABELS + "omnitrace-bin;${TEST_LABELS}" + PASS_REGULAR_EXPRESSION + "${TEST_PASS_REGULAR_EXPRESSION}" + FAIL_REGULAR_EXPRESSION + "${TEST_FAIL_REGULAR_EXPRESSION}" + SKIP_REGULAR_EXPRESSION + "${TEST_SKIP_REGULAR_EXPRESSION}" + 
${TEST_PROPERTIES}) + elseif(OMNITRACE_BUILD_TESTING) + message(FATAL_ERROR "Error! ${TEST_TARGET} does not exist") + endif() +endfunction() + +omnitrace_add_bin_test( + NAME omnitrace-exe-help + TARGET omnitrace-exe + ARGS --help + LABELS omnitrace-exe + TIMEOUT 15 + PASS_REGULAR_EXPRESSION + ".*\\\[omnitrace\\\] Usage:.*\\\[DEBUG OPTIONS\\\].*\\\[MODE OPTIONS\\\].*\\\[LIBRARY OPTIONS\\\].*\\\[SYMBOL SELECTION OPTIONS\\\].*\\\[RUNTIME OPTIONS\\\].*\\\[GRANULARITY OPTIONS\\\].*\\\[DYNINST OPTIONS\\\].*" + ) + +omnitrace_add_bin_test( + NAME omnitrace-exe-simulate-ls + TARGET omnitrace-exe + ARGS --simulate --print-format json txt xml -- ls + TIMEOUT 60) + +omnitrace_add_bin_test( + NAME omnitrace-exe-simulate-ls-check + DEPENDS omnitrace-exe-simulate-ls + COMMAND ls + WORKING_DIRECTORY + ${PROJECT_BINARY_DIR}/omnitrace-tests-output/omnitrace-exe-simulate-ls + TIMEOUT 30 + PASS_REGULAR_EXPRESSION + ".*available-instr.json.*available-instr.txt.*available-instr.xml.*excluded-instr.json.*excluded-instr.txt.*excluded-instr.xml.*instrumented-instr.json.*instrumented-instr.txt.*instrumented-instr.xml.*overlapping-instr.json.*overlapping-instr.txt.*overlapping-instr.xml.*" + ) + +omnitrace_add_bin_test( + NAME omnitrace-avail-help + TARGET omnitrace-avail + ARGS --help + LABELS omnitrace-avail + TIMEOUT 15 + PASS_REGULAR_EXPRESSION + ".*\\\[omnitrace-avail\\\] Usage:.*\\\[CATEGORIES\\\].*\\\[VIEW OPTIONS\\\].*\\\[COLUMN OPTIONS\\\].*\\\[WIDTH OPTIONS\\\].*\\\[OUTPUT OPTIONS\\\].*" + ) + +omnitrace_add_bin_test( + NAME omnitrace-avail-filter-wall-clock-available + TARGET omnitrace-avail + ARGS -r wall_clock -C --available + LABELS omnitrace-avail + TIMEOUT 15 + PASS_REGULAR_EXPRESSION + "\\\|[-]+\\\|\n\\\|[ ]+COMPONENT[ ]+\\\|\n\\\|[-]+\\\|\n\\\| (wall_clock)[ ]+\\\|\n\\\| (sampling_wall_clock)[ ]+\\\|\n\\\|[-]+\\\|" + ) + +omnitrace_add_bin_test( + NAME omnitrace-avail-category-filer-omnitrace + TARGET omnitrace-avail + ARGS --categories settings::omnitrace --brief + 
LABELS omnitrace-avail + TIMEOUT 15 + PASS_REGULAR_EXPRESSION "OMNITRACE_(SETTINGS_DESC|OUTPUT_FILE|OUTPUT_PREFIX)" + FAIL_REGULAR_EXPRESSION + "OMNITRACE_(ADD_SECONDARY|SCIENTIFIC|PRECISION|MEMORY_PRECISION|TIMING_PRECISION)" + ) + +omnitrace_add_bin_test( + NAME omnitrace-avail-category-filer-timemory + TARGET omnitrace-avail + ARGS --categories settings::timemory --brief + LABELS omnitrace-avail + TIMEOUT 15 + PASS_REGULAR_EXPRESSION + "OMNITRACE_(ADD_SECONDARY|SCIENTIFIC|PRECISION|MEMORY_PRECISION|TIMING_PRECISION)" + FAIL_REGULAR_EXPRESSION "OMNITRACE_(SETTINGS_DESC|OUTPUT_FILE)") diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index cc4010dcc3..8df9b66904 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -1,186 +1,14 @@ -# ------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------- # # -# omnitrace interface library +# omnitrace: contains all instrumentation functionality # -# ------------------------------------------------------------------------------# - -add_library(omnitrace-interface-library INTERFACE) -add_library(omnitrace::omnitrace-interface-library ALIAS omnitrace-interface-library) - -target_include_directories( - omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_BINARY_DIR}/include) -target_include_directories(omnitrace-interface-library SYSTEM - INTERFACE ${perfetto_DIR}/sdk) - -target_compile_definitions( - omnitrace-interface-library - INTERFACE OMNITRACE_MAX_THREADS=${OMNITRACE_MAX_THREADS} - $,CUSTOM_DATA_SOURCE,>) - -target_link_libraries( - omnitrace-interface-library - INTERFACE $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $,omnitrace::omnitrace-sanitizer,>) - -# ------------------------------------------------------------------------------# +# omnitrace-dl: contains minimal symbols and dlopen's omnitrace # -# omnitrace object library +# omnitrace-user: 
contains symbols for user API # -# ------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------- # -add_library(omnitrace-object-library OBJECT) -add_library(omnitrace::omnitrace-object-library ALIAS omnitrace-object-library) - -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/library/defines.hpp.in - ${CMAKE_CURRENT_BINARY_DIR}/include/library/defines.hpp @ONLY) - -set(library_sources - ${CMAKE_CURRENT_LIST_DIR}/src/library.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/api.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/config.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/cpu_freq.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/critical_trace.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/kokkosp.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/gpu.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/perfetto.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/ptl.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/sampling.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/state.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/thread_data.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/thread_sampler.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/timemory.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/components/backtrace.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/components/fork_gotcha.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/components/mpi_gotcha.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/components/omnitrace.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/components/pthread_gotcha.cpp - ${perfetto_DIR}/sdk/perfetto.cc) - -set(library_headers - ${CMAKE_CURRENT_LIST_DIR}/include/library.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/api.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/config.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/common.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/cpu_freq.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/critical_trace.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/debug.hpp - 
${CMAKE_CURRENT_LIST_DIR}/include/library/gpu.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/perfetto.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/ptl.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/sampling.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/state.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/thread_data.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/thread_sampler.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/timemory.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/fwd.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/backtrace.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/fork_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/functors.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/mpi_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/omnitrace.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/rocm_smi.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/roctracer.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/roctracer_callbacks.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/library/components/pthread_gotcha.hpp - ${perfetto_DIR}/sdk/perfetto.h) - -target_sources(omnitrace-object-library PRIVATE ${library_sources} ${library_headers}) - -if(OMNITRACE_USE_ROCTRACER) - target_sources( - omnitrace-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src/library/components/roctracer.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/library/components/roctracer_callbacks.cpp) -endif() - -if(OMNITRACE_USE_ROCM_SMI) - target_sources(omnitrace-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src/library/components/rocm_smi.cpp) -endif() - -target_link_libraries(omnitrace-object-library PRIVATE omnitrace-interface-library) - -if(OMNITRACE_DYNINST_API_RT) - get_filename_component(OMNITRACE_DYNINST_API_RT_DIR "${OMNITRACE_DYNINST_API_RT}" - DIRECTORY) -endif() - -# ------------------------------------------------------------------------------# -# -# omnitrace shared 
library -# -# ------------------------------------------------------------------------------# - -add_library(omnitrace-library SHARED $) -add_library(omnitrace::omnitrace-library ALIAS omnitrace-library) - -target_link_libraries(omnitrace-library PRIVATE omnitrace-interface-library) - -set_target_properties( - omnitrace-library - PROPERTIES OUTPUT_NAME omnitrace - VERSION ${PROJECT_VERSION} - SOVERSION ${PROJECT_VERSION_MAJOR} - INSTALL_RPATH - "\$ORIGIN:\$ORIGIN/timemory/libunwind:\$ORIGIN/dyninst-tpls/libs") - -install( - TARGETS omnitrace-library - DESTINATION ${CMAKE_INSTALL_LIBDIR} - OPTIONAL) - -# ------------------------------------------------------------------------------# -# -# omnitrace dl library -# -# ------------------------------------------------------------------------------# - -set(CMAKE_BUILD_TYPE "Release") -set(CMAKE_SKIP_RPATH OFF) -set(BUILD_RPATH_USE_ORIGIN ON) - -add_library(omnitrace-dl-library SHARED) -add_library(omnitrace::omnitrace-dl-library ALIAS omnitrace-dl-library) - -target_sources(omnitrace-dl-library PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/dl.cpp) -target_link_libraries(omnitrace-dl-library PRIVATE ${dl_LIBRARY}) - -check_cxx_compiler_flag("-fno-exceptions" omnitrace_dl_library_fno_exceptions) -check_cxx_compiler_flag("-ftls-model=local-dynamic" - omnitrace_dl_library_ftls_module_local_dynamic) - -if(OMNITRACE_BUILD_DEVELOPER) - if(omnitrace_dl_library_fno_exceptions) - target_compile_options(omnitrace-dl-library PRIVATE -fno-exceptions) - endif() - - if(omnitrace_dl_library_ftls_module_local_dynamic) - target_compile_options(omnitrace-dl-library PRIVATE -ftls-model=local-dynamic) - endif() -endif() - -set_target_properties( - omnitrace-dl-library - PROPERTIES OUTPUT_NAME omnitrace-dl - CXX_VISIBILITY_PRESET "protected" - VERSION ${PROJECT_VERSION} - SOVERSION ${PROJECT_VERSION_MAJOR} - POSITION_INDEPENDENT_CODE ON - BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN") - -install( - TARGETS omnitrace-dl-library - DESTINATION 
${CMAKE_INSTALL_LIBDIR} - OPTIONAL) +add_subdirectory(common) +add_subdirectory(omnitrace) +add_subdirectory(omnitrace-dl) +add_subdirectory(omnitrace-user) diff --git a/source/lib/common/CMakeLists.txt b/source/lib/common/CMakeLists.txt new file mode 100644 index 0000000000..48a972dabc --- /dev/null +++ b/source/lib/common/CMakeLists.txt @@ -0,0 +1,22 @@ +# ------------------------------------------------------------------------------# +# +# omnitrace common headers +# +# ------------------------------------------------------------------------------# + +add_library(omnitrace-common-library INTERFACE) +add_library(omnitrace::common-library ALIAS omnitrace-common-library) + +target_sources( + omnitrace-common-library + INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/defines.h + ${CMAKE_CURRENT_SOURCE_DIR}/delimit.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/environment.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/invoke.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/join.hpp) + +get_filename_component(COMMON_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" DIRECTORY) + +target_include_directories(omnitrace-common-library INTERFACE ${COMMON_INCLUDE_DIR}) +target_compile_definitions(omnitrace-common-library + INTERFACE $) diff --git a/source/lib/common/defines.h b/source/lib/common/defines.h new file mode 100644 index 0000000000..f8ea232de3 --- /dev/null +++ b/source/lib/common/defines.h @@ -0,0 +1,36 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#define OMNITRACE_ATTRIBUTE(...) __attribute__((__VA_ARGS__)) +#define OMNITRACE_VISIBILITY(MODE) OMNITRACE_ATTRIBUTE(visibility(MODE)) +#define OMNITRACE_PUBLIC_API OMNITRACE_VISIBILITY("default") +#define OMNITRACE_HIDDEN_API OMNITRACE_VISIBILITY("hidden") +#define OMNITRACE_WEAK_API OMNITRACE_ATTRIBUTE(weak) +#define OMNITRACE_INLINE OMNITRACE_ATTRIBUTE(__always_inline__) + +#if defined(__cplusplus) +# if !defined(OMNITRACE_FOLD_EXPRESSION) +# define OMNITRACE_FOLD_EXPRESSION(...) ((__VA_ARGS__), ...) +# endif +#endif diff --git a/source/lib/common/delimit.hpp b/source/lib/common/delimit.hpp new file mode 100644 index 0000000000..c489f24e7e --- /dev/null +++ b/source/lib/common/delimit.hpp @@ -0,0 +1,65 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include +#include + +namespace omnitrace +{ +inline namespace common +{ +namespace +{ +template > +inline ContainerT +delimit(const std::string& line, const char* delimiters = "\"',;: "); + +template +inline ContainerT +delimit(const std::string& line, const char* delimiters) +{ + ContainerT _result{}; + size_t _beginp = 0; // position that is the beginning of the new string + size_t _delimp = 0; // position of the delimiter in the string + while(_beginp < line.length() && _delimp < line.length()) + { + // find the first character (starting at _delimp) that is not a delimiter + _beginp = line.find_first_not_of(delimiters, _delimp); + // if no character at or after _delimp that is not a delimiter is found, + // then we are done + if(_beginp == std::string::npos) break; + // starting at the position of the new string, find the next delimiter + _delimp = line.find_first_of(delimiters, _beginp); + std::string _tmp{}; + // starting at the position of the new string, get the characters + // between this position and the next delimiter + _tmp = line.substr(_beginp, _delimp - _beginp); + // don't add empty strings + if(!_tmp.empty()) _result.emplace(_result.end(), _tmp); + } + return _result; +} +} // namespace +} // namespace common +} // namespace omnitrace diff --git a/source/lib/common/environment.hpp b/source/lib/common/environment.hpp new file mode 100644 index 0000000000..1133c06932 --- /dev/null +++ b/source/lib/common/environment.hpp @@ -0,0 +1,77 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include +#include +#include +#include + +namespace omnitrace +{ +inline namespace common +{ +namespace +{ +inline std::string +get_env(const char* env_id, const char* _default) +{ + if(strlen(env_id) == 0) return _default; + char* env_var = ::std::getenv(env_id); + if(env_var) return std::string{ env_var }; + return _default; +} + +inline int +get_env(const char* env_id, int _default) +{ + if(strlen(env_id) == 0) return _default; + char* env_var = ::std::getenv(env_id); + if(env_var) return std::stoi(env_var); + return _default; +} + +inline bool +get_env(const char* env_id, bool _default) +{ + if(strlen(env_id) == 0) return _default; + char* env_var = ::std::getenv(env_id); + if(env_var) + { + if(std::string_view{ env_var }.find_first_not_of("0123456789") == + std::string_view::npos) + return static_cast(std::stoi(env_var)); + else + { + for(size_t i = 0; i < strlen(env_var); ++i) + env_var[i] = tolower(env_var[i]); + for(const auto& itr : { "off", "false", "no", "n", "f", "0" }) + if(strcmp(env_var, itr) == 0) return false; + } + return true; + } + return _default; +} +} // namespace +} // namespace common +} // namespace omnitrace diff --git a/source/lib/common/invoke.hpp b/source/lib/common/invoke.hpp new file mode 100644 index 0000000000..81b8046d34 --- /dev/null +++ b/source/lib/common/invoke.hpp @@ -0,0 +1,114 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "common/defines.h" +#include "common/join.hpp" + +#include +#include +#include +#include + +#if !defined(OMNITRACE_COMMON_LIBRARY_NAME) +# define OMNITRACE_COMMON_LIBRARY_NAME "common" +# error OMNITRACE_COMMON_LIBRARY_NAME must be defined +#endif + +namespace omnitrace +{ +inline namespace common +{ +namespace +{ +template +inline auto +invoke(const char* _name, FuncT&& _func, Args... _args) OMNITRACE_HIDDEN_API; + +inline int32_t& +get_guard() +{ + static thread_local int32_t _v = 0; + return _v; +} + +inline int64_t +get_thread_index() +{ + static std::atomic _c{ 0 }; + static thread_local auto _v = _c++; + return _v; +} + +template +auto +invoke(const char* _name, int _verbose, FuncT&& _func, Args... 
_args) +{ + if(_func) + { + struct decrement_guard + { + // decrement the guard as it exits the scope + ~decrement_guard() { --get_guard(); } + } _unlk{}; + + // if _lk is ever greater than zero on the same thread, this + // means a function within the current function is calling + // our instrumentation so we ignore the call + int32_t _lk = get_guard()++; + if(_lk == 0) + { + if(_verbose > 3) + { + fflush(stderr); + fprintf(stderr, + "[omnitrace][" OMNITRACE_COMMON_LIBRARY_NAME "][%li] %s(%s)\n", + get_thread_index(), _name, join(", ", _args...).c_str()); + fflush(stderr); + } + return std::invoke(std::forward(_func), _args...); + } + else if(_verbose > 2) + { + fflush(stderr); + fprintf(stderr, + "[omnitrace][" OMNITRACE_COMMON_LIBRARY_NAME + "][%li] %s(%s) was guarded :: value = %i\n", + get_thread_index(), _name, join(", ", _args...).c_str(), _lk); + fflush(stderr); + } + } + else if(_verbose >= 0) + { + fprintf(stderr, + "[omnitrace][" OMNITRACE_COMMON_LIBRARY_NAME + "][%li] %s(%s) ignored :: null function pointer\n", + get_thread_index(), _name, join(", ", _args...).c_str()); + } + + using return_type = decltype(std::invoke(std::forward(_func), _args...)); + if constexpr(!std::is_void::value) return return_type(); +} +} // namespace +} // namespace common +} // namespace omnitrace diff --git a/source/lib/common/join.hpp b/source/lib/common/join.hpp new file mode 100644 index 0000000000..a268452a8c --- /dev/null +++ b/source/lib/common/join.hpp @@ -0,0 +1,55 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include + +#if !defined(OMNITRACE_FOLD_EXPRESSION) +# define OMNITRACE_FOLD_EXPRESSION(...) ((__VA_ARGS__), ...) +#endif + +namespace omnitrace +{ +inline namespace common +{ +namespace +{ +template +auto +join(DelimT&& _delim, Args&&... 
_args) +{ + std::stringstream _ss{}; + OMNITRACE_FOLD_EXPRESSION(_ss << _delim << _args); + if constexpr(std::is_same::value) + { + return _ss.str().substr(1); + } + else + { + return _ss.str().substr(std::string{ _delim }.length()); + } +} +} // namespace +} // namespace common +} // namespace omnitrace diff --git a/source/lib/omnitrace-dl/CMakeLists.txt b/source/lib/omnitrace-dl/CMakeLists.txt new file mode 100644 index 0000000000..f7de41496e --- /dev/null +++ b/source/lib/omnitrace-dl/CMakeLists.txt @@ -0,0 +1,55 @@ +# ------------------------------------------------------------------------------# +# +# omnitrace dl library +# +# ------------------------------------------------------------------------------# + +set(CMAKE_BUILD_TYPE "Release") +set(CMAKE_SKIP_RPATH OFF) +set(BUILD_RPATH_USE_ORIGIN ON) + +add_library(omnitrace-dl-library SHARED) +add_library(omnitrace::omnitrace-dl-library ALIAS omnitrace-dl-library) + +target_sources(omnitrace-dl-library PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/dl.cpp) +target_link_libraries(omnitrace-dl-library PRIVATE ${dl_LIBRARY} + omnitrace::common-library) + +check_cxx_compiler_flag("-fno-exceptions" omnitrace_dl_library_fno_exceptions) +check_cxx_compiler_flag("-ftls-model=local-dynamic" + omnitrace_dl_library_ftls_module_local_dynamic) + +if(OMNITRACE_BUILD_DEVELOPER) + if(omnitrace_dl_library_fno_exceptions) + target_compile_options(omnitrace-dl-library PRIVATE -fno-exceptions) + endif() + + if(omnitrace_dl_library_ftls_module_local_dynamic) + target_compile_options(omnitrace-dl-library PRIVATE -ftls-model=local-dynamic) + endif() +endif() + +omnitrace_target_compile_definitions( + omnitrace-dl-library PRIVATE OMNITRACE_USE_OMPT=$) + +set_target_properties( + omnitrace-dl-library + PROPERTIES OUTPUT_NAME omnitrace-dl + CXX_VISIBILITY_PRESET "protected" + VERSION ${PROJECT_VERSION} + SOVERSION ${PROJECT_VERSION_MAJOR} + POSITION_INDEPENDENT_CODE ON + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN") + +install( + TARGETS 
omnitrace-dl-library + EXPORT omnitrace-dl-library-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR} + OPTIONAL) + +install( + EXPORT omnitrace-dl-library-targets + FILE omnitrace-dl-library-targets.cmake + NAMESPACE omnitrace:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/cmake/omnitrace) diff --git a/source/lib/src/dl.cpp b/source/lib/omnitrace-dl/dl.cpp similarity index 50% rename from source/lib/src/dl.cpp rename to source/lib/omnitrace-dl/dl.cpp index 5f07558558..c068e5f92a 100644 --- a/source/lib/src/dl.cpp +++ b/source/lib/omnitrace-dl/dl.cpp @@ -22,6 +22,15 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#define OMNITRACE_COMMON_LIBRARY_NAME "dl" + +#include "common/defines.h" +#include "common/delimit.hpp" +#include "common/environment.hpp" +#include "common/invoke.hpp" +#include "common/join.hpp" + +#include #include #include #include @@ -38,15 +47,15 @@ #include #include -#define OMNITRACE_FOLD_EXPRESSION(...) ((__VA_ARGS__), ...) -#define OMNITRACE_VISIBLE __attribute__((visibility("default"))) -#define OMNITRACE_HIDDEN __attribute__((visibility("internal"))) -#define OMNITRACE_INLINE __attribute__((__always_inline__)) +#if !defined(OMNITRACE_USE_OMPT) +# define OMNITRACE_USE_OMPT 0 +#endif + #define OMNITRACE_DLSYM(VARNAME, HANDLE, FUNCNAME) \ if(HANDLE) \ { \ *(void**) (&VARNAME) = dlsym(HANDLE, FUNCNAME); \ - if(VARNAME == nullptr && _omnitrace_dl_verbose >= 0) \ + if(VARNAME == nullptr && _omnitrace_dl_verbose >= _warn_verbose) \ { \ fprintf(stderr, "[omnitrace][dl][pid=%i]> %s :: %s\n", getpid(), FUNCNAME, \ dlerror()); \ @@ -61,50 +70,45 @@ extern "C" { - void omnitrace_init_library(void) OMNITRACE_VISIBLE; - void omnitrace_init(const char*, bool, const char*) OMNITRACE_VISIBLE; - void omnitrace_finalize(void) OMNITRACE_VISIBLE; - void omnitrace_set_env(const char* env_name, const char* env_val) OMNITRACE_VISIBLE; - void omnitrace_set_mpi(bool use, bool attached) OMNITRACE_VISIBLE; - void 
omnitrace_push_trace(const char* name) OMNITRACE_VISIBLE; - void omnitrace_pop_trace(const char* name) OMNITRACE_VISIBLE; + struct ompt_start_tool_result_t; + + void omnitrace_init_library(void) OMNITRACE_PUBLIC_API; + void omnitrace_init(const char*, bool, const char*) OMNITRACE_PUBLIC_API; + void omnitrace_finalize(void) OMNITRACE_PUBLIC_API; + void omnitrace_set_env(const char* env_name, + const char* env_val) OMNITRACE_PUBLIC_API; + void omnitrace_set_mpi(bool use, bool attached) OMNITRACE_PUBLIC_API; + void omnitrace_push_trace(const char* name) OMNITRACE_PUBLIC_API; + void omnitrace_pop_trace(const char* name) OMNITRACE_PUBLIC_API; + + void omnitrace_user_start_trace_dl(void) OMNITRACE_HIDDEN_API; + void omnitrace_user_stop_trace_dl(void) OMNITRACE_HIDDEN_API; + + void omnitrace_user_start_thread_trace_dl(void) OMNITRACE_HIDDEN_API; + void omnitrace_user_stop_thread_trace_dl(void) OMNITRACE_HIDDEN_API; + + void omnitrace_user_push_region_dl(const char*) OMNITRACE_HIDDEN_API; + void omnitrace_user_pop_region_dl(const char*) OMNITRACE_HIDDEN_API; + + ompt_start_tool_result_t* ompt_start_tool(unsigned int, + const char*) OMNITRACE_PUBLIC_API; } //--------------------------------------------------------------------------------------// -inline namespace omnitrace +namespace omnitrace { inline namespace dl { namespace { inline int -get_omnitrace_env(); - -bool -get_env(const char* env_id, bool _default); - -std::string -get_env(const char* env_id, const char* _default); - -int -get_env(const char* env_id, int _default); - -template -auto -join(DelimT&& _delim, Args&&... _args) +get_omnitrace_env() { - std::stringstream _ss{}; - OMNITRACE_FOLD_EXPRESSION(_ss << _delim << _args); - if constexpr(std::is_same::value) - { - return _ss.str().substr(1); - } - else - { - return _ss.str().substr(std::string{ _delim }.length()); - } + auto&& _debug = get_env("OMNITRACE_DEBUG", false); + return get_env("OMNITRACE_VERBOSE", (_debug) ? 
100 : 0); } + // environment priority: // - OMNITRACE_DL_DEBUG // - OMNITRACE_DL_VERBOSE @@ -114,10 +118,6 @@ int _omnitrace_dl_verbose = get_env("OMNITRACE_DL_DEBUG", false) ? 100 : get_env("OMNITRACE_DL_VERBOSE", get_omnitrace_env()); -template > -inline ContainerT -delimit(const std::string& line, const char* delimiters = "\"',;: "); - // The docs for dlopen suggest that the combination of RTLD_LOCAL + RTLD_DEEPBIND // (when available) helps ensure that the symbols in the instrumentation library // libomnitrace.so will use it's own symbols... not symbols that are potentially @@ -162,39 +162,38 @@ const char* _omnitrace_dl_dlopen_descr = "RTLD_LAZY | RTLD_LOCAL"; #endif /// This class contains function pointers for omnitrace's instrumentation functions -struct OMNITRACE_HIDDEN indirect +struct OMNITRACE_HIDDEN_API indirect { - explicit OMNITRACE_INLINE indirect(std::string libpath) - : m_libpath{ find_path(std::move(libpath)) } + OMNITRACE_INLINE indirect(std::string omnilib, std::string userlib) + : m_omnilib{ find_path(std::move(omnilib)) } + , m_userlib{ find_path(std::move(userlib)) } { if(_omnitrace_dl_verbose > 0) { fprintf(stderr, "[omnitrace][dl][pid=%i] libomnitrace.so resolved to '%s'\n", - getpid(), m_libpath.c_str()); + getpid(), m_omnilib.c_str()); } - auto _omni_hsa_lib = m_libpath; + auto _omni_hsa_lib = m_omnilib; const char* _hsa_lib = getenv("HSA_TOOLS_LIB"); if(_hsa_lib) _omni_hsa_lib.append(":").append(_hsa_lib); setenv("HSA_TOOLS_LIB", _omni_hsa_lib.c_str(), 1); - open(); + m_omnihandle = open(m_omnilib); + m_userhandle = open(m_userlib); init(); } - OMNITRACE_INLINE ~indirect() { dlclose(m_libhandle); } + OMNITRACE_INLINE ~indirect() { dlclose(m_omnihandle); } - OMNITRACE_INLINE void open() + static OMNITRACE_INLINE void* open(const std::string& _lib) { - if(m_libhandle) return; - - auto* libhandle = dlopen(m_libpath.c_str(), _omnitrace_dl_dlopen_flags); + auto* libhandle = dlopen(_lib.c_str(), _omnitrace_dl_dlopen_flags); if(libhandle) { - 
m_libhandle = libhandle; if(_omnitrace_dl_verbose > 0) { fprintf(stderr, "[omnitrace][dl][pid=%i] dlopen(%s, %s) :: success\n", - getpid(), m_libpath.c_str(), _omnitrace_dl_dlopen_descr); + getpid(), _lib.c_str(), _omnitrace_dl_dlopen_descr); } } else @@ -203,26 +202,48 @@ struct OMNITRACE_HIDDEN indirect { perror("dlopen"); fprintf(stderr, "[omnitrace][dl][pid=%i] dlopen(%s, %s) :: %s\n", - getpid(), m_libpath.c_str(), _omnitrace_dl_dlopen_descr, - dlerror()); + getpid(), _lib.c_str(), _omnitrace_dl_dlopen_descr, dlerror()); } } dlerror(); // Clear any existing error + + return libhandle; } OMNITRACE_INLINE void init() { - if(!m_libhandle) open(); + if(!m_omnihandle) m_omnihandle = open(m_omnilib); + int _warn_verbose = 0; // Initialize all pointers - OMNITRACE_DLSYM(omnitrace_init_library_f, m_libhandle, "omnitrace_init_library"); - OMNITRACE_DLSYM(omnitrace_init_f, m_libhandle, "omnitrace_init"); - OMNITRACE_DLSYM(omnitrace_finalize_f, m_libhandle, "omnitrace_finalize"); - OMNITRACE_DLSYM(omnitrace_set_env_f, m_libhandle, "omnitrace_set_env"); - OMNITRACE_DLSYM(omnitrace_set_mpi_f, m_libhandle, "omnitrace_set_mpi"); - OMNITRACE_DLSYM(omnitrace_push_trace_f, m_libhandle, "omnitrace_push_trace"); - OMNITRACE_DLSYM(omnitrace_pop_trace_f, m_libhandle, "omnitrace_pop_trace"); + OMNITRACE_DLSYM(omnitrace_init_library_f, m_omnihandle, "omnitrace_init_library"); + OMNITRACE_DLSYM(omnitrace_init_f, m_omnihandle, "omnitrace_init"); + OMNITRACE_DLSYM(omnitrace_finalize_f, m_omnihandle, "omnitrace_finalize"); + OMNITRACE_DLSYM(omnitrace_set_env_f, m_omnihandle, "omnitrace_set_env"); + OMNITRACE_DLSYM(omnitrace_set_mpi_f, m_omnihandle, "omnitrace_set_mpi"); + OMNITRACE_DLSYM(omnitrace_push_trace_f, m_omnihandle, "omnitrace_push_trace"); + OMNITRACE_DLSYM(omnitrace_pop_trace_f, m_omnihandle, "omnitrace_pop_trace"); + OMNITRACE_DLSYM(omnitrace_push_region_f, m_omnihandle, "omnitrace_push_region"); + OMNITRACE_DLSYM(omnitrace_pop_region_f, m_omnihandle, "omnitrace_pop_region"); 
+#if OMNITRACE_USE_OMPT == 0 + _warn_verbose = 5; +#endif + OMNITRACE_DLSYM(ompt_start_tool_f, m_omnihandle, "ompt_start_tool"); + + if(!m_userhandle) m_userhandle = open(m_userlib); + _warn_verbose = 0; + OMNITRACE_DLSYM(omnitrace_user_configure_f, m_userhandle, + "omnitrace_user_configure"); + + if(omnitrace_user_configure_f) + { + (*omnitrace_user_configure_f)( + &omnitrace_user_start_trace_dl, &omnitrace_user_stop_trace_dl, + &omnitrace_user_start_thread_trace_dl, + &omnitrace_user_stop_thread_trace_dl, &omnitrace_user_push_region_dl, + &omnitrace_user_pop_region_dl); + } } static OMNITRACE_INLINE std::string find_path(std::string&& _path) @@ -246,196 +267,170 @@ struct OMNITRACE_HIDDEN indirect } public: - void (*omnitrace_init_library_f)(void) = nullptr; - void (*omnitrace_init_f)(const char*, bool, const char*) = nullptr; - void (*omnitrace_finalize_f)(void) = nullptr; - void (*omnitrace_set_env_f)(const char*, const char*) = nullptr; - void (*omnitrace_set_mpi_f)(bool, bool) = nullptr; - void (*omnitrace_push_trace_f)(const char*) = nullptr; - void (*omnitrace_pop_trace_f)(const char*) = nullptr; + void (*omnitrace_init_library_f)(void) = nullptr; + void (*omnitrace_init_f)(const char*, bool, const char*) = nullptr; + void (*omnitrace_finalize_f)(void) = nullptr; + void (*omnitrace_set_env_f)(const char*, const char*) = nullptr; + void (*omnitrace_set_mpi_f)(bool, bool) = nullptr; + void (*omnitrace_push_trace_f)(const char*) = nullptr; + void (*omnitrace_pop_trace_f)(const char*) = nullptr; + void (*omnitrace_push_region_f)(const char*) = nullptr; + void (*omnitrace_pop_region_f)(const char*) = nullptr; + void (*omnitrace_user_configure_f)(void (*)(void), void (*)(void), void (*)(void), + void (*)(void), void (*)(const char*), + void (*)(const char*)) = nullptr; + ompt_start_tool_result_t* (*ompt_start_tool_f)(unsigned int, const char*); private: - void* m_libhandle = nullptr; - std::string m_libpath = {}; + void* m_omnihandle = nullptr; + void* 
m_userhandle = nullptr; + std::string m_omnilib = {}; + std::string m_userlib = {}; }; inline indirect& -get_indirect() OMNITRACE_HIDDEN; +get_indirect() OMNITRACE_HIDDEN_API; -template -inline void -invoke(const char* _name, FuncT&& _func, Args... _args) OMNITRACE_HIDDEN; +indirect& +get_indirect() +{ + static auto _v = + indirect{ get_env("OMNITRACE_LIBRARY", "libomnitrace.so"), + get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so") }; + return _v; +} + +auto& +get_active() +{ + static bool _v = false; + return _v; +} + +auto& +get_enabled() +{ + static std::atomic _v{ get_env("OMNITRACE_INIT_ENABLED", true) }; + return _v; +} + +auto& +get_thread_enabled() +{ + static thread_local bool _v = get_enabled(); + return _v; +} + +auto& +get_count() +{ + static std::atomic _v{ 0 }; + return _v; +} + +auto& +get_thread_count() +{ + static thread_local int64_t _v = 0; + return _v; +} } // namespace } // namespace dl } // namespace omnitrace //--------------------------------------------------------------------------------------// +#define OMNITRACE_DL_INVOKE(...) 
\ + ::omnitrace::common::invoke(__FUNCTION__, ::omnitrace::dl::_omnitrace_dl_verbose, \ + __VA_ARGS__) + +using omnitrace::get_indirect; +namespace dl = omnitrace::dl; + extern "C" { void omnitrace_init_library(void) { - invoke(__FUNCTION__, get_indirect().omnitrace_init_library_f); + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_init_library_f); } void omnitrace_init(const char* a, bool b, const char* c) { - invoke(__FUNCTION__, get_indirect().omnitrace_init_f, a, b, c); + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_init_f, a, b, c); + dl::get_active() = true; } void omnitrace_finalize(void) { - invoke(__FUNCTION__, get_indirect().omnitrace_finalize_f); + dl::get_active() = false; + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_finalize_f); } void omnitrace_push_trace(const char* name) { - invoke(__FUNCTION__, get_indirect().omnitrace_push_trace_f, name); + if(!dl::get_active()) return; + if(dl::get_thread_enabled()) + { + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_push_trace_f, name); + } + else + { + ++dl::get_thread_count(); + } } void omnitrace_pop_trace(const char* name) { - invoke(__FUNCTION__, get_indirect().omnitrace_pop_trace_f, name); + if(!dl::get_active()) return; + if(dl::get_thread_enabled()) + { + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_pop_trace_f, name); + } + else + { + if(dl::get_thread_count()-- == 0) omnitrace_user_start_thread_trace_dl(); + } } void omnitrace_set_env(const char* a, const char* b) { setenv(a, b, 0); - invoke(__FUNCTION__, get_indirect().omnitrace_set_env_f, a, b); + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_env_f, a, b); } void omnitrace_set_mpi(bool a, bool b) { - invoke(__FUNCTION__, get_indirect().omnitrace_set_mpi_f, a, b); + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_mpi_f, a, b); } -} -//--------------------------------------------------------------------------------------// + void omnitrace_user_start_trace_dl(void) { dl::get_enabled().store(true); } + void omnitrace_user_stop_trace_dl(void) { 
dl::get_enabled().store(false); } + void omnitrace_user_start_thread_trace_dl(void) { dl::get_thread_enabled() = true; } + void omnitrace_user_stop_thread_trace_dl(void) { dl::get_thread_enabled() = false; } -inline namespace omnitrace -{ -inline namespace dl -{ -namespace -{ -template -inline ContainerT -delimit(const std::string& line, const char* delimiters) -{ - ContainerT _result{}; - size_t _beginp = 0; // position that is the beginning of the new string - size_t _delimp = 0; // position of the delimiter in the string - while(_beginp < line.length() && _delimp < line.length()) + void omnitrace_user_push_region_dl(const char* name) { - // find the first character (starting at _delimp) that is not a delimiter - _beginp = line.find_first_not_of(delimiters, _delimp); - // if no a character after or at _end that is not a delimiter is not found - // then we are done - if(_beginp == std::string::npos) break; - // starting at the position of the new string, find the next delimiter - _delimp = line.find_first_of(delimiters, _beginp); - std::string _tmp{}; - // starting at the position of the new string, get the characters - // between this position and the next delimiter - _tmp = line.substr(_beginp, _delimp - _beginp); - // don't add empty strings - if(!_tmp.empty()) _result.emplace(_result.end(), _tmp); + if(!dl::get_active()) return; + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_push_region_f, name); } - return _result; -} -std::string -get_env(const char* env_id, const char* _default) -{ - if(strlen(env_id) == 0) return _default; - char* env_var = ::std::getenv(env_id); - if(env_var) return std::string{ env_var }; - return _default; -} - -int -get_env(const char* env_id, int _default) -{ - if(strlen(env_id) == 0) return _default; - char* env_var = ::std::getenv(env_id); - if(env_var) return std::stoi(env_var); - return _default; -} - -bool -get_env(const char* env_id, bool _default) -{ - if(strlen(env_id) == 0) return _default; - char* env_var = 
::std::getenv(env_id); - if(env_var) + void omnitrace_user_pop_region_dl(const char* name) { - if(std::string{ env_var }.find_first_not_of("0123456789") == std::string::npos) - return static_cast(std::stoi(env_var)); - else - { - for(size_t i = 0; i < strlen(env_var); ++i) - env_var[i] = tolower(env_var[i]); - for(const auto& itr : { "off", "false", "no", "n", "f", "0" }) - if(strcmp(env_var, itr) == 0) return false; - } - return true; + if(!dl::get_active()) return; + OMNITRACE_DL_INVOKE(get_indirect().omnitrace_pop_region_f, name); } - return _default; -} -int -get_omnitrace_env() -{ - auto&& _debug = get_env("OMNITRACE_DEBUG", false); - return get_env("OMNITRACE_VERBOSE", (_debug) ? 100 : 0); -} - -indirect& -get_indirect() -{ - static auto _v = indirect{ get_env("OMNITRACE_LIBRARY", "libomnitrace.so") }; - return _v; -} - -int32_t& -get_guard() -{ - static thread_local int32_t _v = 0; - return _v; -} - -template -void -invoke(const char* _name, FuncT&& _func, Args... _args) -{ - if(_func) + ompt_start_tool_result_t* ompt_start_tool(unsigned int omp_version, + const char* runtime_version) { - // if _lk is ever greater than zero on the same thread, this - // means a function within the current function is calling - // our instrumentation so we ignore the call - int32_t _lk = get_guard()++; - if(_lk == 0) - { - std::invoke(std::forward(_func), _args...); - } - else if(_omnitrace_dl_verbose > 2) - { - fflush(stderr); - fprintf(stderr, "[omnitrace][dl] call to %s was guarded :: value = %i\n", - _name, _lk); - fflush(stderr); - } - // decrement the guard as it exits the scope - --get_guard(); - } - else if(_omnitrace_dl_verbose >= 0) - { - fprintf(stderr, "[omnitrace][dl] %s\n", - join("", "null function pointer to ", _name, ". 
Ignoring ", _name, "(", - _args..., ")") - .c_str()); +#if OMNITRACE_USE_OMPT == 0 + (void) omp_version; + (void) runtime_version; + return nullptr; +#else + if(!omnitrace::common::get_env("OMNITRACE_USE_OMPT", true)) return nullptr; + return OMNITRACE_DL_INVOKE(get_indirect().ompt_start_tool_f, omp_version, + runtime_version); +#endif } } -} // namespace -} // namespace dl -} // namespace omnitrace diff --git a/source/lib/omnitrace-user/CMakeLists.txt b/source/lib/omnitrace-user/CMakeLists.txt new file mode 100644 index 0000000000..7362fe7bd5 --- /dev/null +++ b/source/lib/omnitrace-user/CMakeLists.txt @@ -0,0 +1,62 @@ +# ------------------------------------------------------------------------------# +# +# omnitrace user library +# +# ------------------------------------------------------------------------------# + +set(CMAKE_BUILD_TYPE "Release") +set(CMAKE_SKIP_RPATH OFF) +# the CMAKE_-prefixed variable is what initializes the BUILD_RPATH_USE_ORIGIN target property +set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) +set(CMAKE_CXX_VISIBILITY_PRESET "hidden") +set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) + +add_library(omnitrace-user-library SHARED) +add_library(omnitrace::omnitrace-user-library ALIAS omnitrace-user-library) + +target_sources( + omnitrace-user-library PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/user.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/omnitrace/user.h) +target_include_directories( + omnitrace-user-library PUBLIC $ + $) + +check_cxx_compiler_flag("-fno-exceptions" omnitrace_user_library_fno_exceptions) +check_cxx_compiler_flag("-ftls-model=local-dynamic" + omnitrace_user_library_ftls_module_local_dynamic) + +if(OMNITRACE_BUILD_DEVELOPER) + if(omnitrace_user_library_fno_exceptions) + target_compile_options(omnitrace-user-library PRIVATE -fno-exceptions) + endif() + + if(omnitrace_user_library_ftls_module_local_dynamic) + target_compile_options(omnitrace-user-library PRIVATE -ftls-model=local-dynamic) + endif() +endif() + +set_target_properties( + omnitrace-user-library + PROPERTIES OUTPUT_NAME omnitrace-user + VERSION ${PROJECT_VERSION} + SOVERSION ${PROJECT_VERSION_MAJOR} + 
POSITION_INDEPENDENT_CODE ON + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN") + +install( + TARGETS omnitrace-user-library + EXPORT omnitrace-user-library-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR} + OPTIONAL) + +install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/omnitrace/user.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/omnitrace + OPTIONAL) + +install( + EXPORT omnitrace-user-library-targets + FILE omnitrace-user-library-targets.cmake + NAMESPACE omnitrace:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/cmake/omnitrace) diff --git a/source/lib/omnitrace-user/omnitrace/user.h b/source/lib/omnitrace-user/omnitrace/user.h new file mode 100644 index 0000000000..a5f554399e --- /dev/null +++ b/source/lib/omnitrace-user/omnitrace/user.h @@ -0,0 +1,50 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#ifndef OMNITRACE_USER_H_ +#define OMNITRACE_USER_H_ 1 + +#define OMNITRACE_ATTRIBUTE(...) __attribute__((__VA_ARGS__)) +#define OMNITRACE_VISIBILITY(MODE) OMNITRACE_ATTRIBUTE(visibility(MODE)) +#define OMNITRACE_PUBLIC_API OMNITRACE_VISIBILITY("default") +#define OMNITRACE_HIDDEN_API OMNITRACE_VISIBILITY("hidden") + +#if defined(__cplusplus) +extern "C" +{ +#endif + extern void omnitrace_user_start_trace(void) OMNITRACE_PUBLIC_API; + extern void omnitrace_user_stop_trace(void) OMNITRACE_PUBLIC_API; + extern void omnitrace_user_start_thread_trace(void) OMNITRACE_PUBLIC_API; + extern void omnitrace_user_stop_thread_trace(void) OMNITRACE_PUBLIC_API; + extern void omnitrace_user_push_region(const char*) OMNITRACE_PUBLIC_API; + extern void omnitrace_user_pop_region(const char*) OMNITRACE_PUBLIC_API; + + extern void omnitrace_user_configure(void (*)(void), void (*)(void), void (*)(void), + void (*)(void), void (*)(const char*), + void (*)(const char*)) OMNITRACE_PUBLIC_API; + +#if defined(__cplusplus) +} +#endif + +#endif // OMNITRACE_USER_H_ diff --git a/source/lib/omnitrace-user/user.cpp b/source/lib/omnitrace-user/user.cpp new file mode 100644 index 0000000000..1bc2c317bf --- /dev/null +++ b/source/lib/omnitrace-user/user.cpp @@ -0,0 +1,84 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#define OMNITRACE_SOURCE 1 + +#include "omnitrace/user.h" + +#include + +namespace +{ +using void_func_t = void (*)(void); +using region_func_t = void (*)(const char*); + +void_func_t _start_trace = nullptr; +void_func_t _stop_trace = nullptr; +void_func_t _start_thread_trace = nullptr; +void_func_t _stop_thread_trace = nullptr; +region_func_t _push_region = nullptr; +region_func_t _pop_region = nullptr; +} // namespace + +extern "C" +{ + void omnitrace_user_start_trace(void) + { + if(_start_trace) (*_start_trace)(); + } + + void omnitrace_user_stop_trace(void) + { + if(_stop_trace) (*_stop_trace)(); + } + + void omnitrace_user_start_thread_trace(void) + { + if(_start_thread_trace) (*_start_thread_trace)(); + } + + void omnitrace_user_stop_thread_trace(void) + { + if(_stop_thread_trace) (*_stop_thread_trace)(); + } + + void omnitrace_user_push_region(const char* name) + { + if(_push_region) (*_push_region)(name); + } + + void omnitrace_user_pop_region(const char* name) + { + if(_pop_region) (*_pop_region)(name); + } + + void omnitrace_user_configure(void (*_a)(), void (*_b)(), void (*_c)(), void (*_d)(), + void (*_e)(const char*), void (*_f)(const char*)) + { + _start_trace = _a; + _stop_trace = _b; + _start_thread_trace = _c; + _stop_thread_trace = _d; + _push_region = _e; + _pop_region = _f; + } +} diff --git a/source/lib/omnitrace/CMakeLists.txt b/source/lib/omnitrace/CMakeLists.txt new file mode 100644 index 0000000000..453f8f502b --- /dev/null +++ 
b/source/lib/omnitrace/CMakeLists.txt @@ -0,0 +1,145 @@ +# ------------------------------------------------------------------------------# +# +# omnitrace interface library +# +# ------------------------------------------------------------------------------# + +add_library(omnitrace-interface-library INTERFACE) +add_library(omnitrace::omnitrace-interface-library ALIAS omnitrace-interface-library) + +target_include_directories( + omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_BINARY_DIR}/include) +target_include_directories(omnitrace-interface-library SYSTEM + INTERFACE ${perfetto_DIR}/sdk) + +target_compile_definitions( + omnitrace-interface-library + INTERFACE OMNITRACE_MAX_THREADS=${OMNITRACE_MAX_THREADS} + $,CUSTOM_DATA_SOURCE,>) + +target_link_libraries( + omnitrace-interface-library + INTERFACE $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $,omnitrace::omnitrace-sanitizer,>) + +# ------------------------------------------------------------------------------# +# +# omnitrace object library +# +# ------------------------------------------------------------------------------# + +add_library(omnitrace-object-library OBJECT) +add_library(omnitrace::omnitrace-object-library ALIAS omnitrace-object-library) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/library/defines.hpp.in + ${CMAKE_CURRENT_BINARY_DIR}/include/library/defines.hpp @ONLY) + +set(library_sources + ${CMAKE_CURRENT_LIST_DIR}/src/library.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/api.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/config.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/cpu_freq.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/critical_trace.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/kokkosp.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/gpu.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/ompt.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/perfetto.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/ptl.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/sampling.cpp + 
${CMAKE_CURRENT_LIST_DIR}/src/library/state.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/thread_data.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/thread_sampler.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/timemory.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/backtrace.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/fork_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/mpi_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/omnitrace.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/pthread_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/user_region.cpp + ${perfetto_DIR}/sdk/perfetto.cc) + +set(library_headers + ${CMAKE_CURRENT_LIST_DIR}/include/library.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/api.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/config.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/common.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/cpu_freq.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/critical_trace.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/debug.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/gpu.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/ompt.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/perfetto.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/ptl.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/sampling.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/state.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/thread_data.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/thread_sampler.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/timemory.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/fwd.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/backtrace.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/fork_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/functors.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/mpi_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/omnitrace.hpp + 
${CMAKE_CURRENT_LIST_DIR}/include/library/components/rocm_smi.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/roctracer.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/roctracer_callbacks.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/pthread_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/library/components/user_region.hpp + ${perfetto_DIR}/sdk/perfetto.h) + +target_sources(omnitrace-object-library PRIVATE ${library_sources} ${library_headers}) + +if(OMNITRACE_USE_ROCTRACER) + target_sources( + omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src/library/components/roctracer.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/library/components/roctracer_callbacks.cpp) +endif() + +if(OMNITRACE_USE_ROCM_SMI) + target_sources(omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src/library/components/rocm_smi.cpp) +endif() + +target_link_libraries(omnitrace-object-library PRIVATE omnitrace-interface-library) + +if(OMNITRACE_DYNINST_API_RT) + get_filename_component(OMNITRACE_DYNINST_API_RT_DIR "${OMNITRACE_DYNINST_API_RT}" + DIRECTORY) +endif() + +# ------------------------------------------------------------------------------# +# +# omnitrace shared library +# +# ------------------------------------------------------------------------------# + +add_library(omnitrace-library SHARED $) +add_library(omnitrace::omnitrace-library ALIAS omnitrace-library) + +target_link_libraries(omnitrace-library PRIVATE omnitrace-interface-library) + +set_target_properties( + omnitrace-library + PROPERTIES OUTPUT_NAME omnitrace + VERSION ${PROJECT_VERSION} + SOVERSION ${PROJECT_VERSION_MAJOR} + INSTALL_RPATH + "\$ORIGIN:\$ORIGIN/timemory/libunwind:\$ORIGIN/dyninst-tpls/libs") + +install( + TARGETS omnitrace-library + DESTINATION ${CMAKE_INSTALL_LIBDIR} + OPTIONAL) diff --git a/source/lib/include/library.hpp b/source/lib/omnitrace/include/library.hpp similarity index 100% rename from source/lib/include/library.hpp rename to 
source/lib/omnitrace/include/library.hpp diff --git a/source/lib/include/library/api.hpp b/source/lib/omnitrace/include/library/api.hpp similarity index 87% rename from source/lib/include/library/api.hpp rename to source/lib/omnitrace/include/library/api.hpp index 56e7bd35c4..8444d23ef4 100644 --- a/source/lib/include/library/api.hpp +++ b/source/lib/omnitrace/include/library/api.hpp @@ -51,6 +51,12 @@ extern "C" /// stops an instrumentation region void omnitrace_pop_trace(const char* name) OMNITRACE_PUBLIC_API; + /// starts an instrumentation region (user-defined) + void omnitrace_push_region(const char* name) OMNITRACE_PUBLIC_API; + + /// stops an instrumentation region (user-defined) + void omnitrace_pop_region(const char* name) OMNITRACE_PUBLIC_API; + // these are the real implementations for internal calling convention void omnitrace_init_library_hidden(void) OMNITRACE_HIDDEN_API; void omnitrace_init_hidden(const char*, bool, const char*) OMNITRACE_HIDDEN_API; @@ -60,4 +66,6 @@ extern "C" void omnitrace_set_mpi_hidden(bool use, bool attached) OMNITRACE_HIDDEN_API; void omnitrace_push_trace_hidden(const char* name) OMNITRACE_HIDDEN_API; void omnitrace_pop_trace_hidden(const char* name) OMNITRACE_HIDDEN_API; + void omnitrace_push_region_hidden(const char* name) OMNITRACE_HIDDEN_API; + void omnitrace_pop_region_hidden(const char* name) OMNITRACE_HIDDEN_API; } diff --git a/source/lib/include/library/common.hpp b/source/lib/omnitrace/include/library/common.hpp similarity index 100% rename from source/lib/include/library/common.hpp rename to source/lib/omnitrace/include/library/common.hpp diff --git a/source/lib/include/library/components/backtrace.hpp b/source/lib/omnitrace/include/library/components/backtrace.hpp similarity index 100% rename from source/lib/include/library/components/backtrace.hpp rename to source/lib/omnitrace/include/library/components/backtrace.hpp diff --git a/source/lib/include/library/components/fork_gotcha.hpp 
b/source/lib/omnitrace/include/library/components/fork_gotcha.hpp similarity index 100% rename from source/lib/include/library/components/fork_gotcha.hpp rename to source/lib/omnitrace/include/library/components/fork_gotcha.hpp diff --git a/source/lib/include/library/components/functors.hpp b/source/lib/omnitrace/include/library/components/functors.hpp similarity index 100% rename from source/lib/include/library/components/functors.hpp rename to source/lib/omnitrace/include/library/components/functors.hpp diff --git a/source/lib/include/library/components/fwd.hpp b/source/lib/omnitrace/include/library/components/fwd.hpp similarity index 94% rename from source/lib/include/library/components/fwd.hpp rename to source/lib/omnitrace/include/library/components/fwd.hpp index 82fa9879bc..bc3c341496 100644 --- a/source/lib/include/library/components/fwd.hpp +++ b/source/lib/omnitrace/include/library/components/fwd.hpp @@ -68,6 +68,7 @@ using functor_t = std::function; using default_functor_t = functor_t; struct omnitrace; +struct user_region; struct backtrace; struct backtrace_wall_clock {}; @@ -126,6 +127,8 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_gpu_ TIMEMORY_SET_COMPONENT_API(omnitrace::component::omnitrace, project::omnitrace, category::dynamic_instrumentation, os::supports_linux) +TIMEMORY_SET_COMPONENT_API(omnitrace::component::user_region, project::omnitrace, + os::supports_linux) TIMEMORY_SET_COMPONENT_API(omnitrace::component::roctracer, project::omnitrace, tpls::rocm, device::gpu, os::supports_linux, category::external) @@ -154,6 +157,8 @@ TIMEMORY_SET_COMPONENT_API(omnitrace::component::sampling_gpu_temp, project::omn TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::omnitrace, OMNITRACE_COMPONENT, "omnitrace", "omnitrace_component") +TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::user_region, OMNITRACE_USER_REGION, + "user_region", "omnitrace_user_region") 
TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::roctracer, OMNITRACE_ROCTRACER, "roctracer", "omnitrace_roctracer") TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_wall_clock, @@ -173,6 +178,15 @@ TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_gpu_power, TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_gpu_temp, OMNITRACE_SAMPLING_GPU_TEMP, "sampling_gpu_temp", "") +TIMEMORY_METADATA_SPECIALIZATION( + omnitrace::component::omnitrace, "omnitrace", + "Invokes instrumentation functions 'omnitrace_push_trace' and 'omnitrace_pop_trace'", + "Used by gotcha wrappers") +TIMEMORY_METADATA_SPECIALIZATION( + omnitrace::component::user_region, "user_region", + "Invokes instrumentation functions 'omnitrace_user_push_region' and " + "'omnitrace_user_pop_region'", + "Used by OMPT") TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::roctracer, "roctracer", "High-precision ROCm API and kernel tracing", "") TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::sampling_wall_clock, diff --git a/source/lib/include/library/components/mpi_gotcha.hpp b/source/lib/omnitrace/include/library/components/mpi_gotcha.hpp similarity index 100% rename from source/lib/include/library/components/mpi_gotcha.hpp rename to source/lib/omnitrace/include/library/components/mpi_gotcha.hpp diff --git a/source/lib/include/library/components/omnitrace.hpp b/source/lib/omnitrace/include/library/components/omnitrace.hpp similarity index 89% rename from source/lib/include/library/components/omnitrace.hpp rename to source/lib/omnitrace/include/library/components/omnitrace.hpp index 3adfaaefd8..abc3885e2a 100644 --- a/source/lib/include/library/components/omnitrace.hpp +++ b/source/lib/omnitrace/include/library/components/omnitrace.hpp @@ -43,8 +43,3 @@ private: }; } // namespace component } // namespace omnitrace - -TIMEMORY_METADATA_SPECIALIZATION( - omnitrace::component::omnitrace, "omnitrace", - "Invokes instrumentation functions 
'omnitrace_push_trace' and 'omnitrace_pop_trace'", - "Used by gotcha wrappers") diff --git a/source/lib/include/library/components/pthread_gotcha.hpp b/source/lib/omnitrace/include/library/components/pthread_gotcha.hpp similarity index 100% rename from source/lib/include/library/components/pthread_gotcha.hpp rename to source/lib/omnitrace/include/library/components/pthread_gotcha.hpp diff --git a/source/lib/include/library/components/rocm_smi.hpp b/source/lib/omnitrace/include/library/components/rocm_smi.hpp similarity index 100% rename from source/lib/include/library/components/rocm_smi.hpp rename to source/lib/omnitrace/include/library/components/rocm_smi.hpp diff --git a/source/lib/include/library/components/roctracer.hpp b/source/lib/omnitrace/include/library/components/roctracer.hpp similarity index 100% rename from source/lib/include/library/components/roctracer.hpp rename to source/lib/omnitrace/include/library/components/roctracer.hpp diff --git a/source/lib/include/library/components/roctracer_callbacks.hpp b/source/lib/omnitrace/include/library/components/roctracer_callbacks.hpp similarity index 100% rename from source/lib/include/library/components/roctracer_callbacks.hpp rename to source/lib/omnitrace/include/library/components/roctracer_callbacks.hpp diff --git a/source/lib/omnitrace/include/library/components/user_region.hpp b/source/lib/omnitrace/include/library/components/user_region.hpp new file mode 100644 index 0000000000..d18894d77f --- /dev/null +++ b/source/lib/omnitrace/include/library/components/user_region.hpp @@ -0,0 +1,56 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "library/defines.hpp" +#include "library/timemory.hpp" + +namespace omnitrace +{ +namespace component +{ +// timemory component wrapping an omnitrace user-defined region (start/stop) +// (its metadata in components/fwd.hpp lists it as used by OMPT) +struct user_region : comp::base +{ + static std::string label() { return "user_region"; } + void start(); + void stop(); + void set_prefix(const char*); + +private: + const char* m_prefix = nullptr; +}; +} // namespace component +} // namespace omnitrace + +TIMEMORY_COMPONENT_ALIAS(omnitrace_user_region, omnitrace::component::user_region) + +#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ + (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) + +# include + +TIMEMORY_DECLARE_EXTERN_COMPONENT(omnitrace_user_region, false, void) + +#endif diff --git a/source/lib/include/library/config.hpp b/source/lib/omnitrace/include/library/config.hpp similarity index 98% rename from source/lib/include/library/config.hpp rename to source/lib/omnitrace/include/library/config.hpp index e764301d3b..48b329d16f 100644 --- a/source/lib/include/library/config.hpp +++ b/source/lib/omnitrace/include/library/config.hpp @@ -171,6 +171,9 @@ get_use_critical_trace(); bool get_use_kokkosp(); +bool +get_use_ompt(); + bool get_timeline_sampling(); @@ -245,6 +248,9 @@ get_critical_trace_per_row(); State& get_state(); +/// returns old state +State set_state(State); + std::unique_ptr& get_main_bundle(); diff --git a/source/lib/include/library/cpu_freq.hpp b/source/lib/omnitrace/include/library/cpu_freq.hpp similarity index 100% rename from source/lib/include/library/cpu_freq.hpp rename to source/lib/omnitrace/include/library/cpu_freq.hpp diff --git a/source/lib/include/library/critical_trace.hpp b/source/lib/omnitrace/include/library/critical_trace.hpp similarity index 100% rename from source/lib/include/library/critical_trace.hpp rename to source/lib/omnitrace/include/library/critical_trace.hpp diff --git a/source/lib/include/library/debug.hpp
b/source/lib/omnitrace/include/library/debug.hpp similarity index 79% rename from source/lib/include/library/debug.hpp rename to source/lib/omnitrace/include/library/debug.hpp index 820b030fbd..480120c06c 100644 --- a/source/lib/include/library/debug.hpp +++ b/source/lib/omnitrace/include/library/debug.hpp @@ -30,6 +30,9 @@ #include #include +#include +#include +#include namespace omnitrace { @@ -47,8 +50,32 @@ get_debug_pid(); bool get_critical_trace_debug(); } // namespace config + +namespace debug +{ +namespace +{ +template +auto get_chars(std::index_sequence) +{ + static const char _v[sizeof...(Idx) + 1] = { T::get()[Idx]..., '\0' }; + return _v; +} + +template +auto +get_chars() +{ + return get_chars(typename T::sequence{}); +} +} // namespace +} // namespace debug } // namespace omnitrace +#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y +#define OMNITRACE_LINESTR TIMEMORY_STRINGIZE(__LINE__) +#define OMNITRACE_VARIABLE(LABEL) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, LABEL) + #if defined(TIMEMORY_USE_MPI) # define OMNITRACE_PROCESS_IDENTIFIER static_cast(::tim::dmp::rank()) #elif defined(TIMEMORY_USE_MPI_HEADERS) @@ -59,6 +86,26 @@ get_critical_trace_debug(); # define OMNITRACE_PROCESS_IDENTIFIER static_cast(::tim::process::get_id()) #endif +#define OMNITRACE_FUNCTION \ + std::string{ __FUNCTION__ } \ + .substr(0, std::string_view{ __FUNCTION__ }.find("_hidden")) \ + .c_str() + +#define OMNITRACE_CT_FUNCTION(VAR, STR) \ + static constexpr auto OMNITRACE_VARIABLE(__LINE__) = std::string_view{ STR }; \ + VAR = OMNITRACE_CT_FUNCTION_IMPL(OMNITRACE_VARIABLE(__LINE__)); + +#define OMNITRACE_CT_FUNCTION_IMPL(STR) \ + []() { \ + struct wrapper \ + { \ + static constexpr const char* get() { return STR.data(); } \ + using sequence = \ + std::make_index_sequence; \ + }; \ + return ::omnitrace::debug::get_chars(); \ + }(); + #define OMNITRACE_CONDITIONAL_PRINT(COND, ...) 
\ if((COND) && ::omnitrace::config::get_debug_tid() && \ ::omnitrace::config::get_debug_pid()) \ @@ -89,7 +136,7 @@ get_critical_trace_debug(); fflush(stderr); \ tim::auto_lock_t _lk{ tim::type_mutex() }; \ fprintf(stderr, "[omnitrace][%i][%li][%s] ", OMNITRACE_PROCESS_IDENTIFIER, \ - tim::threading::get_id(), __FUNCTION__); \ + tim::threading::get_id(), OMNITRACE_FUNCTION); \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } @@ -100,7 +147,7 @@ get_critical_trace_debug(); { \ fflush(stderr); \ tim::auto_lock_t _lk{ tim::type_mutex() }; \ - fprintf(stderr, "[omnitrace][%s] ", __FUNCTION__); \ + fprintf(stderr, "[omnitrace][%s] ", OMNITRACE_FUNCTION); \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } @@ -110,7 +157,8 @@ get_critical_trace_debug(); { \ char _msg_buffer[2048]; \ snprintf(_msg_buffer, 2048, "[omnitrace][%i][%li][%s] ", \ - OMNITRACE_PROCESS_IDENTIFIER, tim::threading::get_id(), __FUNCTION__); \ + OMNITRACE_PROCESS_IDENTIFIER, tim::threading::get_id(), \ + OMNITRACE_FUNCTION); \ auto len = strlen(_msg_buffer); \ snprintf(_msg_buffer + len, 2048 - len, __VA_ARGS__); \ throw std::runtime_error(_msg_buffer); \ @@ -120,12 +168,20 @@ get_critical_trace_debug(); if(COND) \ { \ char _msg_buffer[2048]; \ - snprintf(_msg_buffer, 2048, "[omnitrace][%s] ", __FUNCTION__); \ + snprintf(_msg_buffer, 2048, "[omnitrace][%s] ", OMNITRACE_FUNCTION); \ auto len = strlen(_msg_buffer); \ snprintf(_msg_buffer + len, 2048 - len, __VA_ARGS__); \ throw std::runtime_error(_msg_buffer); \ } +#define OMNITRACE_CI_THROW(COND, ...) \ + OMNITRACE_CONDITIONAL_THROW(::omnitrace::get_is_continuous_integration() && (COND), \ + __VA_ARGS__) + +#define OMNITRACE_CI_BASIC_THROW(COND, ...) \ + OMNITRACE_CONDITIONAL_BASIC_THROW( \ + ::omnitrace::get_is_continuous_integration() && (COND), __VA_ARGS__) + #define OMNITRACE_STRINGIZE(...) #__VA_ARGS__ #define OMNITRACE_ESC(...) 
__VA_ARGS__ diff --git a/source/lib/include/library/defines.hpp.in b/source/lib/omnitrace/include/library/defines.hpp.in similarity index 85% rename from source/lib/include/library/defines.hpp.in rename to source/lib/omnitrace/include/library/defines.hpp.in index 603c13061c..bd636fcf25 100644 --- a/source/lib/include/library/defines.hpp.in +++ b/source/lib/omnitrace/include/library/defines.hpp.in @@ -36,12 +36,14 @@ // clang-format on #define TIMEMORY_USER_COMPONENT_ENUM \ - OMNITRACE_COMPONENT_idx, OMNITRACE_ROCTRACER_idx, OMNITRACE_SAMPLING_WALL_CLOCK_idx, \ - OMNITRACE_SAMPLING_CPU_CLOCK_idx, OMNITRACE_SAMPLING_PERCENT_idx, \ - OMNITRACE_SAMPLING_GPU_POWER_idx, OMNITRACE_SAMPLING_GPU_TEMP_idx, \ - OMNITRACE_SAMPLING_GPU_BUSY_idx, OMNITRACE_SAMPLING_GPU_MEMORY_USAGE_idx, + OMNITRACE_COMPONENT_idx, OMNITRACE_USER_REGION_idx, OMNITRACE_ROCTRACER_idx, \ + OMNITRACE_SAMPLING_WALL_CLOCK_idx, OMNITRACE_SAMPLING_CPU_CLOCK_idx, \ + OMNITRACE_SAMPLING_PERCENT_idx, OMNITRACE_SAMPLING_GPU_POWER_idx, \ + OMNITRACE_SAMPLING_GPU_TEMP_idx, OMNITRACE_SAMPLING_GPU_BUSY_idx, \ + OMNITRACE_SAMPLING_GPU_MEMORY_USAGE_idx, #define OMNITRACE_COMPONENT OMNITRACE_COMPONENT_idx +#define OMNITRACE_USER_REGION OMNITRACE_USER_REGION_idx #define OMNITRACE_ROCTRACER OMNITRACE_ROCTRACER_idx #define OMNITRACE_SAMPLING_WALL_CLOCK OMNITRACE_SAMPLING_WALL_CLOCK_idx #define OMNITRACE_SAMPLING_CPU_CLOCK OMNITRACE_SAMPLING_CPU_CLOCK_idx diff --git a/source/lib/include/library/dynamic_library.hpp b/source/lib/omnitrace/include/library/dynamic_library.hpp similarity index 100% rename from source/lib/include/library/dynamic_library.hpp rename to source/lib/omnitrace/include/library/dynamic_library.hpp diff --git a/source/lib/include/library/gpu.hpp b/source/lib/omnitrace/include/library/gpu.hpp similarity index 100% rename from source/lib/include/library/gpu.hpp rename to source/lib/omnitrace/include/library/gpu.hpp diff --git a/source/lib/omnitrace/include/library/ompt.hpp 
b/source/lib/omnitrace/include/library/ompt.hpp new file mode 100644 index 0000000000..32b8c52528 --- /dev/null +++ b/source/lib/omnitrace/include/library/ompt.hpp @@ -0,0 +1,35 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +namespace omnitrace +{ +namespace ompt +{ +void +setup(); + +void +shutdown(); +} // namespace ompt +} // namespace omnitrace diff --git a/source/lib/include/library/perfetto.hpp b/source/lib/omnitrace/include/library/perfetto.hpp similarity index 97% rename from source/lib/include/library/perfetto.hpp rename to source/lib/omnitrace/include/library/perfetto.hpp index fde493299d..6a5939160d 100644 --- a/source/lib/include/library/perfetto.hpp +++ b/source/lib/omnitrace/include/library/perfetto.hpp @@ -33,6 +33,7 @@ # define PERFETTO_CATEGORIES \ perfetto::Category("host").SetDescription("Host-side function tracing"), \ perfetto::Category("device").SetDescription("Device-side function tracing"), \ + perfetto::Category("user").SetDescription("User-defined regions"), \ perfetto::Category("rocm_smi").SetDescription("Device-level metrics"), \ perfetto::Category("sampling") \ .SetDescription("Metrics derived from sampling"), \ @@ -46,6 +47,7 @@ # define PERFETTO_CATEGORIES \ perfetto::Category("host").SetDescription("Host-side function tracing"), \ perfetto::Category("device").SetDescription("Device-side function tracing"), \ + perfetto::Category("user").SetDescription("User-defined regions"), \ perfetto::Category("rocm_smi").SetDescription("Device-level metrics"), \ perfetto::Category("sampling") \ .SetDescription("Metrics derived from sampling"), \ diff --git a/source/lib/include/library/ptl.hpp b/source/lib/omnitrace/include/library/ptl.hpp similarity index 100% rename from source/lib/include/library/ptl.hpp rename to source/lib/omnitrace/include/library/ptl.hpp diff --git a/source/lib/include/library/redirect.hpp b/source/lib/omnitrace/include/library/redirect.hpp similarity index 100% rename from source/lib/include/library/redirect.hpp rename to source/lib/omnitrace/include/library/redirect.hpp diff --git a/source/lib/include/library/sampling.hpp b/source/lib/omnitrace/include/library/sampling.hpp similarity index 100% rename from 
source/lib/include/library/sampling.hpp rename to source/lib/omnitrace/include/library/sampling.hpp diff --git a/source/lib/include/library/state.hpp b/source/lib/omnitrace/include/library/state.hpp similarity index 100% rename from source/lib/include/library/state.hpp rename to source/lib/omnitrace/include/library/state.hpp diff --git a/source/lib/include/library/thread_data.hpp b/source/lib/omnitrace/include/library/thread_data.hpp similarity index 100% rename from source/lib/include/library/thread_data.hpp rename to source/lib/omnitrace/include/library/thread_data.hpp diff --git a/source/lib/include/library/thread_sampler.hpp b/source/lib/omnitrace/include/library/thread_sampler.hpp similarity index 100% rename from source/lib/include/library/thread_sampler.hpp rename to source/lib/omnitrace/include/library/thread_sampler.hpp diff --git a/source/lib/include/library/timemory.hpp b/source/lib/omnitrace/include/library/timemory.hpp similarity index 100% rename from source/lib/include/library/timemory.hpp rename to source/lib/omnitrace/include/library/timemory.hpp diff --git a/source/lib/src/library.cpp b/source/lib/omnitrace/src/library.cpp similarity index 81% rename from source/lib/src/library.cpp rename to source/lib/omnitrace/src/library.cpp index fcf1fa332b..5eb11106a5 100644 --- a/source/lib/src/library.cpp +++ b/source/lib/omnitrace/src/library.cpp @@ -32,6 +32,7 @@ #include "library/debug.hpp" #include "library/defines.hpp" #include "library/gpu.hpp" +#include "library/ompt.hpp" #include "library/ptl.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" @@ -46,10 +47,22 @@ using namespace omnitrace; //======================================================================================// -using library_functors = omnitrace::component::functors; +namespace +{ +struct omni_regions +{}; +struct user_regions +{}; +} // namespace -TIMEMORY_DEFINE_NAME_TRAIT("cpu_instrumentation", library_functors); -TIMEMORY_INVOKE_PREINIT(library_functors) 
+using omni_functors = omnitrace::component::functors; +using user_functors = omnitrace::component::functors; + +// TIMEMORY_DEFINE_NAME_TRAIT("host", omni_functors); +// TIMEMORY_DEFINE_NAME_TRAIT("user", user_functors); + +TIMEMORY_INVOKE_PREINIT(omni_functors) +TIMEMORY_INVOKE_PREINIT(user_functors) //======================================================================================// @@ -159,6 +172,9 @@ pop_count() } auto _debug_push = tim::get_env("OMNITRACE_DEBUG_PUSH", false) && !get_debug_env(); +auto _debug_pop = tim::get_env("OMNITRACE_DEBUG_POP", false) && !get_debug_env(); +auto _debug_user = + tim::get_env("OMNITRACE_DEBUG_USER_REGIONS", false) && !get_debug_env(); } // namespace //======================================================================================// @@ -172,11 +188,11 @@ omnitrace_push_trace_hidden(const char* name) if(get_state() == State::Finalized) { OMNITRACE_CONDITIONAL_BASIC_PRINT( - _debug_push, "%s(%s) called during finalization\n", __FUNCTION__, name); + _debug_push, "omnitrace_push_trace(%s) called during finalization\n", name); return; } - OMNITRACE_CONDITIONAL_BASIC_PRINT(_debug_push, "%s(%s)\n", __FUNCTION__, name); + OMNITRACE_CONDITIONAL_BASIC_PRINT(_debug_push, "omnitrace_push_trace(%s)\n", name); // the expectation here is that if the state is not active then the call // to omnitrace_init_tooling_hidden will activate all the appropriate @@ -184,12 +200,13 @@ omnitrace_push_trace_hidden(const char* name) if(get_state() != State::Active && !omnitrace_init_tooling_hidden()) { static auto _debug = get_debug_env() || get_debug_init(); - OMNITRACE_CONDITIONAL_BASIC_PRINT_F(_debug, "%s :: not active. state = %s\n", - name, std::to_string(get_state()).c_str()); + OMNITRACE_CONDITIONAL_BASIC_PRINT( + _debug, "omnitrace_push_trace(%s) ignored :: not active. 
state = %s\n", name, + std::to_string(get_state()).c_str()); return; } - OMNITRACE_DEBUG_F("%s\n", name); + OMNITRACE_DEBUG("omnitrace_push_trace(%s)\n", name); static auto _sample_rate = std::max(get_instrumentation_interval(), 1); static thread_local size_t _sample_idx = 0; @@ -197,7 +214,7 @@ omnitrace_push_trace_hidden(const char* name) auto _enabled = (_sample_idx++ % _sample_rate == 0); _interval->emplace_back(_enabled); - if(_enabled) library_functors::start(name); + if(_enabled) omni_functors::start(name); if(get_use_critical_trace()) { uint64_t _cid = 0; @@ -222,15 +239,17 @@ omnitrace_pop_trace_hidden(const char* name) { ++pop_count(); + OMNITRACE_CONDITIONAL_BASIC_PRINT(_debug_pop, "omnitrace_pop_trace(%s)\n", name); + // only execute when active if(get_state() == State::Active) { - OMNITRACE_DEBUG_F("%s\n", name); + OMNITRACE_DEBUG("omnitrace_pop_trace(%s)\n", name); auto& _interval_data = get_interval_data(); if(!_interval_data->empty()) { - if(_interval_data->back()) library_functors::stop(name); + if(_interval_data->back()) omni_functors::stop(name); _interval_data->pop_back(); } @@ -255,9 +274,64 @@ omnitrace_pop_trace_hidden(const char* name) else { static auto _debug = get_debug_env(); - OMNITRACE_CONDITIONAL_BASIC_PRINT(_debug, "[%s] %s ignored :: state = %s\n", - __FUNCTION__, name, - std::to_string(get_state()).c_str()); + OMNITRACE_CONDITIONAL_BASIC_PRINT( + _debug, "omnitrace_pop_trace(%s) ignored :: state = %s\n", name, + std::to_string(get_state()).c_str()); + } +} + +//======================================================================================// +/// +/// +/// +//======================================================================================// + +extern "C" void +omnitrace_push_region_hidden(const char* name) +{ + // unconditionally return if finalized + if(get_state() == State::Finalized) + { + OMNITRACE_CONDITIONAL_BASIC_PRINT( + _debug_user, "omnitrace_push_region(%s) called during finalization\n", name); + return; + } 
+ + OMNITRACE_CONDITIONAL_BASIC_PRINT(_debug_push, "omnitrace_push_region(%s)\n", name); + + // the expectation here is that if the state is not active then the call + // to omnitrace_init_tooling_hidden will activate all the appropriate + // tooling one time and as it exits set it to active and return true. + if(get_state() != State::Active && !omnitrace_init_tooling_hidden()) + { + static auto _debug = get_debug_env() || get_debug_init(); + OMNITRACE_CONDITIONAL_BASIC_PRINT( + _debug, "omnitrace_push_region(%s) ignored :: not active. state = %s\n", name, + std::to_string(get_state()).c_str()); + return; + } + + OMNITRACE_DEBUG("omnitrace_push_region(%s)\n", name); + user_functors::start(name); +} + +//======================================================================================// + +extern "C" void +omnitrace_pop_region_hidden(const char* name) +{ + // only execute when active + if(get_state() == State::Active) + { + OMNITRACE_DEBUG("omnitrace_pop_region(%s)\n", name); + user_functors::stop(name); + } + else + { + static auto _debug = get_debug_env(); + OMNITRACE_CONDITIONAL_BASIC_PRINT( + _debug, "omnitrace_pop_region(%s) ignored :: state = %s\n", name, + std::to_string(get_state()).c_str()); } } @@ -271,7 +345,7 @@ extern "C" void omnitrace_set_env_hidden(const char* env_name, const char* env_val) { // just search env to avoid initializing the settings - OMNITRACE_CONDITIONAL_PRINT(get_debug_env() || get_verbose_env() > 2, + OMNITRACE_CONDITIONAL_PRINT(get_debug_init() || get_verbose_env() > 2, "[%s] Setting env: %s=%s\n", __FUNCTION__, env_name, env_val); @@ -279,9 +353,10 @@ omnitrace_set_env_hidden(const char* env_name, const char* env_val) OMNITRACE_CONDITIONAL_THROW( get_state() >= State::Init && - (config::get_is_continuous_integration() || get_debug_env()), - "%s(\"%s\", \"%s\") called after omnitrace was initialized. 
state = %s", - __FUNCTION__, env_name, env_val, std::to_string(get_state()).c_str()); + (config::get_is_continuous_integration() || get_debug_init()), + "omnitrace_set_env(\"%s\", \"%s\") called after omnitrace was initialized. state " + "= %s", + env_name, env_val, std::to_string(get_state()).c_str()); } //======================================================================================// @@ -300,7 +375,7 @@ extern "C" void omnitrace_set_mpi_hidden(bool use, bool attached) { // just search env to avoid initializing the settings - OMNITRACE_CONDITIONAL_PRINT(get_debug_env() || get_verbose_env() > 2, + OMNITRACE_CONDITIONAL_PRINT(get_debug_init() || get_verbose_env() > 2, "[%s] use: %s, attached: %s\n", __FUNCTION__, (use) ? "y" : "n", (attached) ? "y" : "n"); @@ -319,9 +394,10 @@ omnitrace_set_mpi_hidden(bool use, bool attached) OMNITRACE_CONDITIONAL_THROW( get_state() >= State::Init && - (config::get_is_continuous_integration() || get_debug_env()), - "%s(use=%s, attached=%s) called after omnitrace was initialized. state = %s", - __FUNCTION__, std::to_string(use).c_str(), std::to_string(attached).c_str(), + (config::get_is_continuous_integration() || get_debug_init()), + "omnitrace_set_mpi(use=%s, attached=%s) called after omnitrace was initialized. 
" + "state = %s", + std::to_string(use).c_str(), std::to_string(attached).c_str(), std::to_string(get_state()).c_str()); _start_gotcha_callback(); @@ -335,23 +411,37 @@ omnitrace_init_library_hidden() auto _tid = threading::get_id(); (void) _tid; - auto _mode = get_mode(); - auto _debug_init = get_debug_init(); + static bool _once = false; + auto _mode = get_mode(); + auto _debug_init = get_debug_init(); + + OMNITRACE_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s...\n", + std::to_string(get_state()).c_str()); + + OMNITRACE_CI_THROW(get_state() != State::PreInit, "State is not PreInit :: %s", + std::to_string(get_state()).c_str()); + + if(get_state() != State::PreInit || get_state() == State::Init || _once) return; + _once = true; OMNITRACE_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s. Setting to %s...\n", std::to_string(get_state()).c_str(), std::to_string(State::Init).c_str()); + OMNITRACE_CONDITIONAL_BASIC_PRINT_F( + _debug_init, "Calling backtrace once so that the one-time call of malloc in " + "glibc's backtrace() occurs...\n"); { - OMNITRACE_CONDITIONAL_BASIC_PRINT_F( - _debug_init, "Calling backtrace once so that the one-time call of malloc in " - "glibc's backtrace() occurs...\n"); std::stringstream _ss{}; tim::print_backtrace<64>(_ss); (void) _ss; } - get_state() = State::Init; + set_state(State::Init); + + OMNITRACE_CI_THROW(get_state() != State::Init, + "set_state(State::Init) failed. state is %s", + std::to_string(get_state()).c_str()); OMNITRACE_CONDITIONAL_BASIC_PRINT_F(_debug_init, "Configuring settings...\n"); @@ -395,8 +485,9 @@ omnitrace_init_library_hidden() if(get_use_kokkosp()) { - auto _force = 0; - if(tim::get_env("KOKKOS_PROFILE_LIBRARY") == "libtimemory.so") + auto _force = 0; + auto _current_kokkosp_lib = tim::get_env("KOKKOS_PROFILE_LIBRARY"); + if(std::regex_search(_current_kokkosp_lib, std::regex{ "libtimemory\\." 
})) _force = 1; tim::set_env("KOKKOS_PROFILE_LIBRARY", "libomnitrace.so", _force); } @@ -537,8 +628,12 @@ omnitrace_init_tooling_hidden() return true; }(); static thread_local auto _dtor = scope::destructor{ []() { - if(get_use_sampling()) sampling::shutdown(); - thread_data::instance()->stop(); + if(get_state() != State::Finalized) + { + if(get_use_sampling()) sampling::shutdown(); + if(thread_data::instance()) + thread_data::instance()->stop(); + } } }; (void) _thread_setup; (void) _dtor; @@ -555,7 +650,7 @@ omnitrace_init_tooling_hidden() (void) _v; }; - // functors for starting and stopping timemory + // functors for starting and stopping timemory omni functors auto _push_timemory = [](const char* name) { auto& _data = get_instrumentation_bundles(); // this generates a hash for the raw string array @@ -566,11 +661,6 @@ omnitrace_init_tooling_hidden() _bundle->start(); }; - auto _push_perfetto = [](const char* name) { - uint64_t _ts = comp::wall_clock::record(); - TRACE_EVENT_BEGIN("host", perfetto::StaticString(name), _ts); - }; - auto _pop_timemory = [](const char* name) { auto _hash = tim::hash::get_hash_id(tim::string_view_t{ name }); auto& _data = get_instrumentation_bundles(); @@ -593,14 +683,33 @@ omnitrace_init_tooling_hidden() } }; + // functors for starting and stopping perfetto omni functors + auto _push_perfetto = [](const char* name) { + uint64_t _ts = comp::wall_clock::record(); + TRACE_EVENT_BEGIN("host", perfetto::StaticString(name), _ts); + }; + auto _pop_perfetto = [](const char*) { uint64_t _ts = comp::wall_clock::record(); TRACE_EVENT_END("host", _ts); }; + // functors for starting and stopping perfetto user functors + auto _push_user_perfetto = [](const char* name) { + uint64_t _ts = comp::wall_clock::record(); + TRACE_EVENT_BEGIN("user", nullptr, _ts, [name](perfetto::EventContext& _ctx) { + _ctx.event()->set_name(name); + }); + }; + + auto _pop_user_perfetto = [](const char*) { + uint64_t _ts = comp::wall_clock::record(); + 
TRACE_EVENT_END("user", _ts); + }; + if(get_use_perfetto() && get_use_timemory()) { - library_functors::configure( + omni_functors::configure( [=](const char* name) { _thread_init(); _push_perfetto(name); @@ -611,20 +720,44 @@ omnitrace_init_tooling_hidden() _pop_timemory(name); _pop_perfetto(name); }); + user_functors::configure( + [=](const char* name) { + _thread_init(); + _push_user_perfetto(name); + _push_timemory(name); + }, + [=](const char* name) { + _pop_timemory(name); + _pop_user_perfetto(name); + }); } else if(get_use_perfetto()) { - library_functors::configure( + omni_functors::configure( [=](const char* name) { _thread_init(); _push_perfetto(name); _setup_thread_sampling(); }, [=](const char* name) { _pop_perfetto(name); }); + user_functors::configure( + [=](const char* name) { + _thread_init(); + _push_user_perfetto(name); + _setup_thread_sampling(); + }, + [=](const char* name) { _pop_user_perfetto(name); }); } else if(get_use_timemory()) { - library_functors::configure( + omni_functors::configure( + [=](const char* name) { + _thread_init(); + _push_timemory(name); + _setup_thread_sampling(); + }, + [=](const char* name) { _pop_timemory(name); }); + user_functors::configure( [=](const char* name) { _thread_init(); _push_timemory(name); @@ -633,6 +766,8 @@ omnitrace_init_tooling_hidden() [=](const char* name) { _pop_timemory(name); }); } + ompt::setup(); + if(get_use_perfetto() && !is_system_backend()) { #if defined(CUSTOM_DATA_SOURCE) @@ -676,9 +811,16 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a (void) pop_count(); get_finalization_functions().emplace_back([_argv0]() { - // if main hasn't been popped yet, pop it - OMNITRACE_BASIC_PRINT("Running omnitrace_pop_trace(%s)...\n", _argv0); - omnitrace_pop_trace_hidden(_argv0); + OMNITRACE_CI_THROW(get_state() != State::Active, + "Finalizer function for popping main invoked in non-active " + "state :: state = %s\n", + std::to_string(get_state()).c_str()); + 
if(get_state() == State::Active) + { + // if main hasn't been popped yet, pop it + OMNITRACE_BASIC_VERBOSE(2, "Running omnitrace_pop_trace(%s)...\n", _argv0); + omnitrace_pop_trace_hidden(_argv0); + } }); std::atexit([]() { @@ -740,9 +882,10 @@ omnitrace_finalize_hidden(void) } } - get_state() = State::Finalized; + set_state(State::Finalized); - library_functors::configure([](const char*) {}, [](const char*) {}); + omni_functors::configure([](const char*) {}, [](const char*) {}); + user_functors::configure([](const char*) {}, [](const char*) {}); pthread_gotcha::push_enable_sampling_on_child_threads(false); pthread_gotcha::set_sampling_on_all_future_threads(false); @@ -785,6 +928,8 @@ omnitrace_finalize_hidden(void) } } + ompt::shutdown(); + OMNITRACE_DEBUG_F("Stopping and destroying instrumentation bundles...\n"); for(auto& itr : instrumentation_bundles::instances()) { diff --git a/source/lib/src/library/api.cpp b/source/lib/omnitrace/src/library/api.cpp similarity index 90% rename from source/lib/src/library/api.cpp rename to source/lib/omnitrace/src/library/api.cpp index cc369caf88..7d8aa32fab 100644 --- a/source/lib/src/library/api.cpp +++ b/source/lib/omnitrace/src/library/api.cpp @@ -34,6 +34,18 @@ omnitrace_pop_trace(const char* _name) omnitrace_pop_trace_hidden(_name); } +extern "C" void +omnitrace_push_region(const char* _name) +{ + omnitrace_push_region_hidden(_name); +} + +extern "C" void +omnitrace_pop_region(const char* _name) +{ + omnitrace_pop_region_hidden(_name); +} + extern "C" void omnitrace_init_library(void) { diff --git a/source/lib/src/library/components/backtrace.cpp b/source/lib/omnitrace/src/library/components/backtrace.cpp similarity index 99% rename from source/lib/src/library/components/backtrace.cpp rename to source/lib/omnitrace/src/library/components/backtrace.cpp index 81d699bc3d..90f25bd7c8 100644 --- a/source/lib/src/library/components/backtrace.cpp +++ b/source/lib/omnitrace/src/library/components/backtrace.cpp @@ -231,7 +231,7 
@@ backtrace::get_thread_cpu_timestamp() const void backtrace::sample(int signum) { - if(get_state() != State::Active) + if(signum != -1 && get_state() != State::Active) { OMNITRACE_CONDITIONAL_PRINT( get_debug_sampling(), @@ -341,7 +341,6 @@ backtrace::configure(bool _setup, int64_t _tid) OMNITRACE_DEBUG("Configuring sampler for thread %lu...\n", _tid); sampler_running_instances::construct(true); - backtrace_init_instances::construct(); sampling::sampler_instances::construct("omnitrace", _tid, *_signal_types); _sampler->set_signals(*_signal_types); _sampler->set_flags(SA_RESTART); @@ -363,6 +362,7 @@ backtrace::configure(bool _setup, int64_t _tid) _sampler->get_rate(units::sec)); // (void) sampling::sampler_t::get_samplers(_tid); + backtrace_init_instances::construct(); get_backtrace_init(_tid)->sample(); _sampler->configure(false); _sampler->start(); @@ -617,6 +617,8 @@ backtrace::post_process(int64_t _tid) auto* _bt = ditr->get(); + if(_bt->m_ts < _last_wall_ts) continue; + double _elapsed_wc = (_bt->m_ts - _last_wall_ts).count(); double _elapsed_cc = (_bt->m_thr_cpu_ts - _last_cpu_ts); diff --git a/source/lib/src/library/components/fork_gotcha.cpp b/source/lib/omnitrace/src/library/components/fork_gotcha.cpp similarity index 100% rename from source/lib/src/library/components/fork_gotcha.cpp rename to source/lib/omnitrace/src/library/components/fork_gotcha.cpp diff --git a/source/lib/src/library/components/mpi_gotcha.cpp b/source/lib/omnitrace/src/library/components/mpi_gotcha.cpp similarity index 97% rename from source/lib/src/library/components/mpi_gotcha.cpp rename to source/lib/omnitrace/src/library/components/mpi_gotcha.cpp index 7a1cc82176..4a1022a4cf 100644 --- a/source/lib/src/library/components/mpi_gotcha.cpp +++ b/source/lib/omnitrace/src/library/components/mpi_gotcha.cpp @@ -86,8 +86,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***) OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), "[%s] %s(int*, char***)\n", 
__FUNCTION__, _data.tool_id.c_str()); - if(get_state() == ::omnitrace::State::DelayedInit) - get_state() = ::omnitrace::State::PreInit; + if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit); omnitrace_push_trace_hidden(_data.tool_id.c_str()); #if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) @@ -102,8 +101,8 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, in OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), "[%s] %s(int*, char***, int, int*)\n", __FUNCTION__, _data.tool_id.c_str()); - if(get_state() == ::omnitrace::State::DelayedInit) - get_state() = ::omnitrace::State::PreInit; + + if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit); omnitrace_push_trace_hidden(_data.tool_id.c_str()); #if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) diff --git a/source/lib/src/library/components/omnitrace.cpp b/source/lib/omnitrace/src/library/components/omnitrace.cpp similarity index 100% rename from source/lib/src/library/components/omnitrace.cpp rename to source/lib/omnitrace/src/library/components/omnitrace.cpp diff --git a/source/lib/src/library/components/pthread_gotcha.cpp b/source/lib/omnitrace/src/library/components/pthread_gotcha.cpp similarity index 98% rename from source/lib/src/library/components/pthread_gotcha.cpp rename to source/lib/omnitrace/src/library/components/pthread_gotcha.cpp index cdadbff29f..85e957d3e2 100644 --- a/source/lib/src/library/components/pthread_gotcha.cpp +++ b/source/lib/omnitrace/src/library/components/pthread_gotcha.cpp @@ -140,6 +140,8 @@ pthread_gotcha::wrapper::operator()() const { _tid = threading::get_id(); threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str()); + // initialize thread-local statics + (void) tim::get_unw_backtrace<12, 1, false>(); { std::unique_lock _lk{ bundles_mutex }; if(comp::roctracer::is_setup()) diff --git a/source/lib/src/library/components/rocm_smi.cpp 
b/source/lib/omnitrace/src/library/components/rocm_smi.cpp similarity index 85% rename from source/lib/src/library/components/rocm_smi.cpp rename to source/lib/omnitrace/src/library/components/rocm_smi.cpp index ccf922aafd..9645ec215b 100644 --- a/source/lib/src/library/components/rocm_smi.cpp +++ b/source/lib/omnitrace/src/library/components/rocm_smi.cpp @@ -86,7 +86,7 @@ check_error(rsmi_status_t ec) } std::atomic& -get_rocm_smi_state() +get_state() { static std::atomic _v{ State::PreInit }; return _v; @@ -151,7 +151,7 @@ config() void sample() { - if(get_rocm_smi_state() != State::Active) return; + if(rocm_smi::get_state() != State::Active) return; for(auto itr : data::device_list) { @@ -168,63 +168,9 @@ sample() void set_state(State _v) { - get_rocm_smi_state().store(_v); + rocm_smi::get_state().store(_v); } -/* -void -data::poll(std::atomic* _state, nsec_t _interval, promise_t* _ready) -{ - threading::set_thread_name("omni.rocm_smi"); - - // notify thread started - if(_ready) _ready->set_value(); - - std::vector*> _bundle_data{}; - _bundle_data.resize(device_count, nullptr); - for(size_t i = 0; i < device_count; ++i) - { - if(device_list.count(i) > 0) - { - _bundle_data.at(i) = &sampler_instances::instances().at(i); - if(!*_bundle_data.at(i)) *_bundle_data.at(i) = std::make_unique(); - } - } - - OMNITRACE_CONDITIONAL_BASIC_PRINT( - get_verbose() > 0 || get_debug(), - "Polling rocm-smi for %zu device(s) at an interval of %f seconds...\n", - device_list.size(), - std::chrono::duration_cast>(_interval).count()); - - get_initial().resize(device_count); - for(auto itr : device_list) - get_initial().at(itr).sample(itr); - - auto _now = std::chrono::steady_clock::now(); - while(_state && _state->load() != State::Finalized && get_state() != State::Finalized) - { - std::this_thread::sleep_until(_now); - if(_state->load() != State::Active) continue; - for(auto itr : device_list) - { - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(), - "Polling rocm-smi for device 
%u...\n", itr); - auto& _data = *_bundle_data.at(itr); - if(!_data) continue; - _data->emplace_back(data{ itr }); - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(), " %s\n", - TIMEMORY_JOIN("", _data->back()).c_str()); - } - while(_now < std::chrono::steady_clock::now()) - _now += _interval; - } - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(), "Polling rocm-smi completed...\n"); - - if(polling_finished) polling_finished->set_value(); -} -*/ - std::vector& data::get_initial() { @@ -236,7 +182,7 @@ bool data::setup() { perfetto_counter_track::init(); - set_state(State::PreInit); + rocm_smi::set_state(State::PreInit); return true; } @@ -244,7 +190,7 @@ bool data::shutdown() { OMNITRACE_DEBUG("Shutting down rocm-smi...\n"); - set_state(State::Finalized); + rocm_smi::set_state(State::Finalized); return true; } diff --git a/source/lib/src/library/components/roctracer.cpp b/source/lib/omnitrace/src/library/components/roctracer.cpp similarity index 100% rename from source/lib/src/library/components/roctracer.cpp rename to source/lib/omnitrace/src/library/components/roctracer.cpp diff --git a/source/lib/src/library/components/roctracer_callbacks.cpp b/source/lib/omnitrace/src/library/components/roctracer_callbacks.cpp similarity index 100% rename from source/lib/src/library/components/roctracer_callbacks.cpp rename to source/lib/omnitrace/src/library/components/roctracer_callbacks.cpp diff --git a/source/lib/omnitrace/src/library/components/user_region.cpp b/source/lib/omnitrace/src/library/components/user_region.cpp new file mode 100644 index 0000000000..8b39b44968 --- /dev/null +++ b/source/lib/omnitrace/src/library/components/user_region.cpp @@ -0,0 +1,52 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/components/user_region.hpp" +#include "library/api.hpp" +#include "library/components/fwd.hpp" + +namespace omnitrace +{ +namespace component +{ +void +user_region::start() +{ + if(m_prefix) omnitrace_push_region_hidden(m_prefix); +} + +void +user_region::stop() +{ + if(m_prefix) omnitrace_pop_region_hidden(m_prefix); +} + +void +user_region::set_prefix(const char* _prefix) +{ + m_prefix = _prefix; +} +} // namespace component +} // namespace omnitrace + +TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::user_region) +TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(omnitrace_user_region, false, void) diff --git a/source/lib/src/library/config.cpp b/source/lib/omnitrace/src/library/config.cpp similarity index 97% rename from source/lib/src/library/config.cpp rename to source/lib/omnitrace/src/library/config.cpp index 722247fd49..b0ece73ae6 100644 --- a/source/lib/src/library/config.cpp +++ b/source/lib/omnitrace/src/library/config.cpp @@ -150,6 +150,12 @@ configure_settings() "Enable support for Kokkos Tools", false, "kokkos", "backend"); +#if defined(TIMEMORY_USE_OMPT) + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_OMPT", + "Enable support for OpenMP-Tools", true, "openmp", "ompt", + "backend"); +#endif + OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_INSTRUMENTATION_INTERVAL", "Instrumentation only takes measurements once every N " "function calls (not statistical)", @@ -736,6 +742,17 @@ get_use_kokkosp() return static_cast&>(*_v->second).get(); } +bool +get_use_ompt() +{ +#if defined(TIMEMORY_USE_OMPT) + static auto _v = get_config()->find("OMNITRACE_USE_OMPT"); + return static_cast&>(*_v->second).get(); +#else + return false; +#endif +} + bool get_critical_trace_debug() { @@ -956,6 +973,20 @@ get_state() return _v; } +State +set_state(State _n) +{ + auto _o = get_state(); + OMNITRACE_CONDITIONAL_PRINT_F(get_debug_init(), "Setting state :: %s -> %s\n", + std::to_string(_o).c_str(), std::to_string(_n).c_str()); + // state should always be increased, not 
decreased + OMNITRACE_CI_BASIC_THROW(_n < _o, + "State is being assigned to a lesser value :: %s -> %s", + std::to_string(_o).c_str(), std::to_string(_n).c_str()); + get_state() = _n; + return _o; +} + std::atomic& get_cpu_cid() { diff --git a/source/lib/src/library/cpu_freq.cpp b/source/lib/omnitrace/src/library/cpu_freq.cpp similarity index 100% rename from source/lib/src/library/cpu_freq.cpp rename to source/lib/omnitrace/src/library/cpu_freq.cpp diff --git a/source/lib/src/library/critical_trace.cpp b/source/lib/omnitrace/src/library/critical_trace.cpp similarity index 100% rename from source/lib/src/library/critical_trace.cpp rename to source/lib/omnitrace/src/library/critical_trace.cpp diff --git a/source/lib/src/library/gpu.cpp b/source/lib/omnitrace/src/library/gpu.cpp similarity index 100% rename from source/lib/src/library/gpu.cpp rename to source/lib/omnitrace/src/library/gpu.cpp diff --git a/source/lib/src/library/kokkosp.cpp b/source/lib/omnitrace/src/library/kokkosp.cpp similarity index 99% rename from source/lib/src/library/kokkosp.cpp rename to source/lib/omnitrace/src/library/kokkosp.cpp index ba3374c571..78f83a8deb 100644 --- a/source/lib/src/library/kokkosp.cpp +++ b/source/lib/omnitrace/src/library/kokkosp.cpp @@ -22,10 +22,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
-// used by Kokkos decls -#if !defined(TIMEMORY_LIBRARY_SOURCE) -# define TIMEMORY_LIBRARY_SOURCE 1 -#endif +#include "library/defines.hpp" + +#define TIMEMORY_KOKKOSP_POSTFIX OMNITRACE_PUBLIC_API #include "library/components/omnitrace.hpp" #include "library/config.hpp" diff --git a/source/lib/omnitrace/src/library/ompt.cpp b/source/lib/omnitrace/src/library/ompt.cpp new file mode 100644 index 0000000000..cf52a8bb89 --- /dev/null +++ b/source/lib/omnitrace/src/library/ompt.cpp @@ -0,0 +1,114 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/defines.hpp" + +#include + +#if defined(TIMEMORY_USE_OMPT) + +# include "library/components/fwd.hpp" +# include "library/components/user_region.hpp" +# include "library/config.hpp" +# include "library/debug.hpp" + +# include +# include +# include + +# include + +using api_t = TIMEMORY_API; +using ompt_handle_t = tim::component::ompt_handle; +using ompt_context_t = tim::openmp::context_handler; +using ompt_toolset_t = typename ompt_handle_t::toolset_type; +using ompt_bundle_t = tim::component_tuple; + +extern "C" +{ + ompt_start_tool_result_t* ompt_start_tool(unsigned int, + const char*) OMNITRACE_PUBLIC_API; +} + +namespace omnitrace +{ +namespace ompt +{ +namespace +{ +std::unique_ptr f_bundle = {}; +bool _init_toolset_off = (trait::runtime_enabled::set(false), true); +} // namespace + +void +setup() +{ + OMNITRACE_VERBOSE(1, "Setting up OMPT...\n"); + trait::runtime_enabled::set(config::get_use_ompt()); + comp::user_ompt_bundle::global_init(); + comp::user_ompt_bundle::reset(); + // provide environment variable for enabling/disabling + if(config::get_use_ompt()) + { + tim::auto_lock_t lk{ tim::type_mutex() }; + comp::user_ompt_bundle::configure(); + f_bundle = + std::make_unique("ompt", quirk::config{}); + } +} + +void +shutdown() +{ + OMNITRACE_VERBOSE(1, "Shutting down OMPT...\n"); + if(f_bundle) + { + f_bundle->stop(); + ompt_context_t::cleanup(); + trait::runtime_enabled::set(false); + } + f_bundle.reset(); +} +} // namespace ompt +} // namespace omnitrace + +// include the ompt_start_tool definition +# include + +#else +namespace omnitrace +{ +namespace ompt +{ +void +setup() +{} + +void +shutdown() +{} +} // namespace ompt +} // namespace omnitrace + +#endif diff --git a/source/lib/src/library/perfetto.cpp b/source/lib/omnitrace/src/library/perfetto.cpp similarity index 100% rename from source/lib/src/library/perfetto.cpp rename to source/lib/omnitrace/src/library/perfetto.cpp diff --git a/source/lib/src/library/ptl.cpp 
b/source/lib/omnitrace/src/library/ptl.cpp similarity index 100% rename from source/lib/src/library/ptl.cpp rename to source/lib/omnitrace/src/library/ptl.cpp diff --git a/source/lib/src/library/sampling.cpp b/source/lib/omnitrace/src/library/sampling.cpp similarity index 100% rename from source/lib/src/library/sampling.cpp rename to source/lib/omnitrace/src/library/sampling.cpp diff --git a/source/lib/src/library/state.cpp b/source/lib/omnitrace/src/library/state.cpp similarity index 100% rename from source/lib/src/library/state.cpp rename to source/lib/omnitrace/src/library/state.cpp diff --git a/source/lib/src/library/thread_data.cpp b/source/lib/omnitrace/src/library/thread_data.cpp similarity index 100% rename from source/lib/src/library/thread_data.cpp rename to source/lib/omnitrace/src/library/thread_data.cpp diff --git a/source/lib/src/library/thread_sampler.cpp b/source/lib/omnitrace/src/library/thread_sampler.cpp similarity index 100% rename from source/lib/src/library/thread_sampler.cpp rename to source/lib/omnitrace/src/library/thread_sampler.cpp diff --git a/source/lib/src/library/timemory.cpp b/source/lib/omnitrace/src/library/timemory.cpp similarity index 100% rename from source/lib/src/library/timemory.cpp rename to source/lib/omnitrace/src/library/timemory.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 370333dce5..73302d76d6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,19 @@ set(_base_environment "LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}" ) +set(_ompt_environment + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_USE_OMPT=ON" + "OMNITRACE_CRITICAL_TRACE=OFF" + "OMP_PROC_BIND=spread" + "OMP_PLACES=threads" + "OMP_NUM_THREADS=2" + "LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}" + ) + set(_perfetto_environment 
"OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=OFF" @@ -75,7 +88,7 @@ set(_fast_environment function(OMNITRACE_ADD_TEST) cmake_parse_arguments( TEST - "" # options + "SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING" # options "NAME;TARGET;MPI;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value args "REWRITE_ARGS;RUNTIME_ARGS;RUN_ARGS;ENVIRONMENT;LABELS;PROPERTIES" # multiple # value args @@ -126,57 +139,74 @@ function(OMNITRACE_ADD_TEST) COMMAND ${COMMAND_PREFIX} $ ${TEST_RUN_ARGS} WORKING_DIRECTORY $) - add_test( - NAME ${TEST_NAME}-binary-rewrite - COMMAND - $ -o - $/${TEST_NAME}.inst ${TEST_REWRITE_ARGS} - -- $ - WORKING_DIRECTORY $) + if(NOT TEST_SKIP_REWRITE) + add_test( + NAME ${TEST_NAME}-binary-rewrite + COMMAND + $ -o + $/${TEST_NAME}.inst + ${TEST_REWRITE_ARGS} -- $ + WORKING_DIRECTORY $) - add_test( - NAME ${TEST_NAME}-binary-rewrite-sampling - COMMAND - $ -o - $/${TEST_NAME}.samp -M sampling - ${TEST_REWRITE_ARGS} -- $ - WORKING_DIRECTORY $) + if(NOT TEST_SKIP_SAMPLING) + add_test( + NAME ${TEST_NAME}-binary-rewrite-sampling + COMMAND + $ -o + $/${TEST_NAME}.samp -M sampling + ${TEST_REWRITE_ARGS} -- $ + WORKING_DIRECTORY $) + endif() - add_test( - NAME ${TEST_NAME}-binary-rewrite-run - COMMAND ${COMMAND_PREFIX} $/${TEST_NAME}.inst + add_test( + NAME ${TEST_NAME}-binary-rewrite-run + COMMAND + ${COMMAND_PREFIX} $/${TEST_NAME}.inst ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY $) - add_test( - NAME ${TEST_NAME}-binary-rewrite-run-sampling - COMMAND ${COMMAND_PREFIX} $/${TEST_NAME}.samp - ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + if(NOT TEST_SKIP_SAMPLING) + add_test( + NAME ${TEST_NAME}-binary-rewrite-run-sampling + COMMAND + ${COMMAND_PREFIX} + $/${TEST_NAME}.samp + ${TEST_RUN_ARGS} + WORKING_DIRECTORY $) + endif() + endif() - add_test( - NAME ${TEST_NAME}-runtime-instrument - COMMAND $ ${TEST_RUNTIME_ARGS} -- - $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + if(NOT TEST_SKIP_RUNTIME) + add_test( + NAME ${TEST_NAME}-runtime-instrument + 
COMMAND $ ${TEST_RUNTIME_ARGS} -- + $ ${TEST_RUN_ARGS} + WORKING_DIRECTORY $) - add_test( - NAME ${TEST_NAME}-runtime-instrument-sampling - COMMAND $ -M sampling ${TEST_RUNTIME_ARGS} -- - $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + if(NOT TEST_SKIP_SAMPLING) + add_test( + NAME ${TEST_NAME}-runtime-instrument-sampling + COMMAND $ -M sampling ${TEST_RUNTIME_ARGS} + -- $ ${TEST_RUN_ARGS} + WORKING_DIRECTORY $) + endif() + endif() - set_tests_properties(${TEST_NAME}-binary-rewrite-run - PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite) + if(TEST ${TEST_NAME}-binary-rewrite-run) + set_tests_properties(${TEST_NAME}-binary-rewrite-run + PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite) + endif() - set_tests_properties(${TEST_NAME}-binary-rewrite-run-sampling - PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite-sampling) + if(TEST ${TEST_NAME}-binary-rewrite-run-sampling) + set_tests_properties(${TEST_NAME}-binary-rewrite-run-sampling + PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite-sampling) + endif() foreach( _TEST baseline binary-rewrite binary-rewrite-run binary-rewrite-sampling binary-rewrite-run-sampling runtime-instrument runtime-instrument-sampling) - string(REPLACE "-run-" "-" _prefix "${TEST_NAME}-${_TEST}-") + string(REPLACE "-run-" "-" _prefix "${TEST_NAME}-${_TEST}/") set(_environ "${TEST_ENVIRONMENT}") set(_labels "${_TEST}") set(_timeout ${TEST_REWRITE_TIMEOUT}) @@ -254,6 +284,23 @@ omnitrace_add_test( RUN_ARGS 10 ${NUM_THREADS} 1000 ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF") +omnitrace_add_test( + NAME user-api + TARGET user-api + REWRITE_ARGS -e -v 2 --min-address-range-loop=64 + RUNTIME_ARGS + -e + -v + 1 + --min-address-range-loop=64 + --label + file + line + return + args + RUN_ARGS 10 ${NUM_THREADS} 1000 + ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF") + omnitrace_add_test( NAME parallel-overhead-no-save-fpr TARGET parallel-overhead @@ -280,7 +327,7 @@ omnitrace_add_test( TARGET lulesh MPI ${LULESH_USE_MPI} NUM_PROCS 8 - 
REWRITE_ARGS -e -v 2 + REWRITE_ARGS -e -v 2 --label file line return args RUNTIME_ARGS -e -v @@ -357,3 +404,19 @@ omnitrace_add_test( RUN_ARGS -i 10 -s 20 -p ENVIRONMENT "${_timemory_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF") + +omnitrace_add_test( + NAME openmp-cg + TARGET openmp-cg + REWRITE_ARGS -e -v 2 --instrument-loops + RUNTIME_ARGS -e -v 1 --label return args + RUN_ARGS + ENVIRONMENT "${_ompt_environment}") + +omnitrace_add_test( + NAME openmp-lu + TARGET openmp-lu + REWRITE_ARGS -e -v 2 --instrument-loops + RUNTIME_ARGS -e -v 1 --label return args + RUN_ARGS REWRITE_TIMEOUT 180 RUNTIME_TIMEOUT 360 + ENVIRONMENT "${_ompt_environment}")