From b29cfac106f1cf41915f6ecc2175adfd7d143db7 Mon Sep 17 00:00:00 2001 From: David Galiffi Date: Fri, 13 Dec 2024 18:48:39 -0500 Subject: [PATCH] Update to use rocprofiler-sdk (#55) - Rename the CMake option "ROCPROFSYS_USE_HIP" to "ROCPROFSYS_USE_ROCM" - Remove the "ROCPROFSYS_USE_ROCM_SMI" option. Controlled with the "ROCPROFSYS_USE_ROCM" option, instead. - Runtime configuration can still toggle ROCPROFSYS_USE_ROCM_SMI to disable the sampling. - Rename ROCPROFSYS_HIP_VERSION macro to ROCPROFSYS_ROCM_VERSION and remove blocks for `ROCPROFSYS_ROCM_VERSION < 60000` - Remove ROCPROFSYS_USE_ROCTRACER and ROCPROFSYS_USE_ROCPROFILER - Update test cases - Update docker files and workflows to install cmake 3.21, which is required for the rocprofiler-sdk `find_package` script. - Remove rocm-6.2 from workflows due to a rocprofiler-sdk API change. [ROCm/rocprofiler-systems commit: 88aa2d3cbe13bdfb3dc8ebf2669556bb331d6a53] --- .../.github/workflows/containers.yml | 40 +- .../.github/workflows/cpack.yml | 35 - .../.github/workflows/opensuse.yml | 4 +- .../.github/workflows/redhat.yml | 8 +- .../.github/workflows/ubuntu-focal.yml | 26 +- .../.github/workflows/ubuntu-jammy.yml | 21 +- .../.github/workflows/ubuntu-noble.yml | 2 +- projects/rocprofiler-systems/CMakeLists.txt | 43 +- .../cmake/ConfigCPack.cmake | 15 +- projects/rocprofiler-systems/cmake/PAPI.cmake | 32 +- .../rocprofiler-systems/cmake/Packages.cmake | 94 +- .../cmake/Templates/modulefile.in | 4 - .../cmake/Templates/setup-env.sh.in | 9 - .../docker/Dockerfile.opensuse | 2 +- .../docker/Dockerfile.opensuse.ci | 2 +- .../docker/Dockerfile.rhel | 2 +- .../docker/Dockerfile.rhel.ci | 2 +- .../docker/Dockerfile.ubuntu | 4 +- .../docker/Dockerfile.ubuntu.ci | 4 +- .../how-to/configuring-runtime-options.rst | 14 +- .../docs/how-to/sampling-call-stack.rst | 16 +- .../docs/install/install.rst | 32 +- .../scripts/build-release.sh | 8 +- .../source/bin/CMakeLists.txt | 15 +- .../source/bin/rocprof-sys-avail/avail.cpp | 21 +- 
.../bin/rocprof-sys-avail/generate_config.cpp | 2 +- .../bin/rocprof-sys-avail/info_type.cpp | 2 - .../source/bin/rocprof-sys-causal/impl.cpp | 9 - .../bin/rocprof-sys-instrument/CMakeLists.txt | 2 + .../rocprof-sys-instrument/internal_libs.cpp | 26 +- .../rocprof-sys-instrument.cpp | 10 +- .../source/bin/rocprof-sys-sample/impl.cpp | 96 +- .../source/lib/CMakeLists.txt | 15 +- .../source/lib/common/CMakeLists.txt | 4 +- .../source/lib/common/defines.h.in | 24 +- .../source/lib/common/setup.hpp | 142 -- .../source/lib/common/static_object.hpp | 207 +++ .../source/lib/common/synchronized.hpp | 167 +++ .../source/lib/core/CMakeLists.txt | 10 +- .../source/lib/core/argparse.cpp | 97 +- .../source/lib/core/categories.hpp | 32 +- .../source/lib/core/components/fwd.hpp | 10 +- .../source/lib/core/config.cpp | 257 +--- .../source/lib/core/config.hpp | 62 +- .../lib/core/containers/stable_vector.hpp | 18 +- .../source/lib/core/gpu.cpp | 418 +----- .../source/lib/core/gpu.hpp | 4 +- .../source/lib/core/hip_runtime.hpp | 27 +- .../source/lib/core/perfetto.hpp | 1 + .../source/lib/core/rccl.hpp | 10 +- .../source/lib/core/rocprofiler-sdk.cpp | 576 ++++++++ .../source/lib/core/rocprofiler-sdk.hpp | 70 + .../source/lib/core/state.cpp | 6 +- .../source/lib/core/utility.hpp | 9 + .../source/lib/rocprof-sys-dl/CMakeLists.txt | 3 +- .../source/lib/rocprof-sys-dl/dl.cpp | 189 ++- .../source/lib/rocprof-sys-dl/dl/dl.hpp | 24 +- .../source/lib/rocprof-sys-dl/main.c | 33 +- .../rocprofiler-systems/categories.h | 16 +- .../source/lib/rocprof-sys/library.cpp | 45 +- .../lib/rocprof-sys/library/CMakeLists.txt | 27 +- .../library/components/CMakeLists.txt | 12 - .../library/components/category_region.hpp | 8 +- .../components/pthread_create_gotcha.cpp | 6 +- .../library/components/rocprofiler.cpp | 193 --- .../library/components/rocprofiler.hpp | 241 --- .../library/components/roctracer.cpp | 396 ----- .../library/components/roctracer.hpp | 117 -- 
.../source/lib/rocprof-sys/library/rocm.cpp | 210 +-- .../source/lib/rocprof-sys/library/rocm.hpp | 40 +- .../rocprof-sys/library/rocm/CMakeLists.txt | 7 - .../library/rocm/hsa_rsrc_factory.cpp | 1027 ------------- .../library/rocm/hsa_rsrc_factory.hpp | 582 -------- .../lib/rocprof-sys/library/rocm_smi.hpp | 5 +- .../rocprof-sys/library/rocprofiler-sdk.cpp | 1308 +++++++++++++++++ .../{rocprofiler.hpp => rocprofiler-sdk.hpp} | 62 +- .../library/rocprofiler-sdk/CMakeLists.txt | 9 + .../library/rocprofiler-sdk/counters.cpp | 135 ++ .../library/rocprofiler-sdk/counters.hpp | 168 +++ .../library/rocprofiler-sdk/fwd.cpp | 270 ++++ .../library/rocprofiler-sdk/fwd.hpp | 252 ++++ .../lib/rocprof-sys/library/rocprofiler.cpp | 834 ----------- .../lib/rocprof-sys/library/roctracer.cpp | 967 ------------ .../lib/rocprof-sys/library/roctracer.hpp | 89 -- .../lib/rocprof-sys/library/runtime.hpp | 1 - .../tests/rocprof-sys-rocm-tests.cmake | 36 +- .../tests/rocprof-sys-testing.cmake | 23 +- 87 files changed, 3842 insertions(+), 6261 deletions(-) create mode 100644 projects/rocprofiler-systems/source/lib/common/static_object.hpp create mode 100644 projects/rocprofiler-systems/source/lib/common/synchronized.hpp create mode 100644 projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp create mode 100644 projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.cpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.hpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.cpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.hpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/CMakeLists.txt delete mode 100644 
projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.cpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.hpp create mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp rename projects/rocprofiler-systems/source/lib/rocprof-sys/library/{rocprofiler.hpp => rocprofiler-sdk.hpp} (54%) create mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt create mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.cpp create mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.hpp create mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp create mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler.cpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.cpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.hpp diff --git a/projects/rocprofiler-systems/.github/workflows/containers.yml b/projects/rocprofiler-systems/.github/workflows/containers.yml index 86d12fcf4d..09bb556028 100644 --- a/projects/rocprofiler-systems/.github/workflows/containers.yml +++ b/projects/rocprofiler-systems/.github/workflows/containers.yml @@ -39,12 +39,10 @@ jobs: version: "15.5" - distro: "opensuse" version: "15.6" - - distro: "rhel" - version: "8.8" - distro: "rhel" version: "8.10" - distro: "rhel" - version: "9.2" + version: "9.3" - distro: "rhel" version: "9.4" @@ -90,9 +88,6 @@ jobs: - os-distro: "ubuntu" os-version: "20.04" rocm-version: "0.0" - - os-distro: "ubuntu" - os-version: "20.04" - rocm-version: "6.2" - os-distro: "ubuntu" os-version: "20.04" rocm-version: "6.3" @@ -100,9 +95,6 @@ jobs: 
- os-distro: "ubuntu" os-version: "22.04" rocm-version: "0.0" - - os-distro: "ubuntu" - os-version: "22.04" - rocm-version: "6.2" - os-distro: "ubuntu" os-version: "22.04" rocm-version: "6.3" @@ -110,9 +102,6 @@ jobs: - os-distro: "ubuntu" os-version: "24.04" rocm-version: "0.0" - - os-distro: "ubuntu" - os-version: "24.04" - rocm-version: "6.2" - os-distro: "ubuntu" os-version: "24.04" rocm-version: "6.3" @@ -120,9 +109,6 @@ jobs: - os-distro: "opensuse" os-version: "15.5" rocm-version: "0.0" - - os-distro: "opensuse" - os-version: "15.5" - rocm-version: "6.2" - os-distro: "opensuse" os-version: "15.5" rocm-version: "6.3" @@ -130,43 +116,19 @@ jobs: - os-distro: "opensuse" os-version: "15.6" rocm-version: "0.0" - - os-distro: "opensuse" - os-version: "15.6" - rocm-version: "6.2" - os-distro: "opensuse" os-version: "15.6" rocm-version: "6.3" - # RHEL 8.9 - - os-distro: "rhel" - os-version: "8.9" - rocm-version: "0.0" - - os-distro: "rhel" - os-version: "8.9" - rocm-version: "6.2" - # RHEL 8.10 - os-distro: "rhel" os-version: "8.10" rocm-version: "0.0" - - os-distro: "rhel" - os-version: "8.10" - rocm-version: "6.2" - os-distro: "rhel" os-version: "8.10" rocm-version: "6.3" - # RHEL 9.3 - - os-distro: "rhel" - os-version: "9.3" - rocm-version: "0.0" - - os-distro: "rhel" - os-version: "9.3" - rocm-version: "6.2" # RHEL 9.4 - os-distro: "rhel" os-version: "9.4" rocm-version: "0.0" - - os-distro: "rhel" - os-version: "9.4" - rocm-version: "6.2" - os-distro: "rhel" os-version: "9.4" rocm-version: "6.3" diff --git a/projects/rocprofiler-systems/.github/workflows/cpack.yml b/projects/rocprofiler-systems/.github/workflows/cpack.yml index 5bb01cea1f..e922318f70 100644 --- a/projects/rocprofiler-systems/.github/workflows/cpack.yml +++ b/projects/rocprofiler-systems/.github/workflows/cpack.yml @@ -37,9 +37,6 @@ jobs: - os-distro: "ubuntu" os-version: "20.04" rocm-version: "0.0" - - os-distro: "ubuntu" - os-version: "20.04" - rocm-version: "6.2" - os-distro: "ubuntu" 
os-version: "20.04" rocm-version: "6.3" @@ -47,9 +44,6 @@ jobs: - os-distro: "ubuntu" os-version: "22.04" rocm-version: "0.0" - - os-distro: "ubuntu" - os-version: "22.04" - rocm-version: "6.2" - os-distro: "ubuntu" os-version: "22.04" rocm-version: "6.3" @@ -57,9 +51,6 @@ jobs: - os-distro: "ubuntu" os-version: "24.04" rocm-version: "0.0" - - os-distro: "ubuntu" - os-version: "24.04" - rocm-version: "6.2" - os-distro: "ubuntu" os-version: "24.04" rocm-version: "6.3" @@ -67,9 +58,6 @@ jobs: - os-distro: "opensuse" os-version: "15.5" rocm-version: "0.0" - - os-distro: "opensuse" - os-version: "15.5" - rocm-version: "6.2" - os-distro: "opensuse" os-version: "15.5" rocm-version: "6.3" @@ -77,43 +65,20 @@ jobs: - os-distro: "opensuse" os-version: "15.6" rocm-version: "0.0" - - os-distro: "opensuse" - os-version: "15.6" - rocm-version: "6.2" - os-distro: "opensuse" os-version: "15.6" rocm-version: "6.3" - # RHEL 8.9 - - os-distro: "rhel" - os-version: "8.9" - rocm-version: "0.0" - - os-distro: "rhel" - os-version: "8.9" - rocm-version: "6.2" # RHEL 8.10 - os-distro: "rhel" os-version: "8.10" rocm-version: "0.0" - - os-distro: "rhel" - os-version: "8.10" - rocm-version: "6.2" - os-distro: "rhel" os-version: "8.10" rocm-version: "6.3" - # RHEL 9.3 - - os-distro: "rhel" - os-version: "9.3" - rocm-version: "0.0" - - os-distro: "rhel" - os-version: "9.3" - rocm-version: "6.2" # RHEL 9.4 - os-distro: "rhel" os-version: "9.4" rocm-version: "0.0" - - os-distro: "rhel" - os-version: "9.4" - rocm-version: "6.2" - os-distro: "rhel" os-version: "9.4" rocm-version: "6.3" diff --git a/projects/rocprofiler-systems/.github/workflows/opensuse.yml b/projects/rocprofiler-systems/.github/workflows/opensuse.yml index 89ae9ba727..dad6a343ba 100644 --- a/projects/rocprofiler-systems/.github/workflows/opensuse.yml +++ b/projects/rocprofiler-systems/.github/workflows/opensuse.yml @@ -66,7 +66,7 @@ jobs: fi python3 -m pip install --upgrade pip && python3 -m pip install --upgrade numpy perfetto 
dataclasses && - python3 -m pip install 'cmake==3.18.4' && + python3 -m pip install 'cmake==3.21' && for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done - name: Configure Env @@ -93,7 +93,7 @@ jobs: -DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems -DROCPROFSYS_BUILD_TESTING=ON -DROCPROFSYS_USE_MPI=OFF - -DROCPROFSYS_USE_HIP=OFF + -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_OMPT=OFF -DROCPROFSYS_USE_PYTHON=ON -DROCPROFSYS_INSTALL_PERFETTO_TOOLS=OFF diff --git a/projects/rocprofiler-systems/.github/workflows/redhat.yml b/projects/rocprofiler-systems/.github/workflows/redhat.yml index 0e9491760d..d582f14dc5 100644 --- a/projects/rocprofiler-systems/.github/workflows/redhat.yml +++ b/projects/rocprofiler-systems/.github/workflows/redhat.yml @@ -46,8 +46,8 @@ jobs: fail-fast: false matrix: compiler: ['g++'] - os-release: [ '8.10', '9.2', '9.4' ] - rocm-version: [ '0.0', '6.2', '6.3' ] + os-release: [ '8.10', '9.3', '9.4' ] + rocm-version: [ '0.0', '6.3' ] build-type: ['Release'] steps: @@ -70,7 +70,7 @@ jobs: fi python3 -m pip install --upgrade pip && python3 -m pip install --upgrade numpy perfetto dataclasses && - python3 -m pip install 'cmake==3.18.4' && + python3 -m pip install 'cmake==3.21' && for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done - name: Install ROCm Packages @@ -108,7 +108,7 @@ jobs: -DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems -DROCPROFSYS_BUILD_TESTING=ON -DROCPROFSYS_USE_MPI=OFF - -DROCPROFSYS_USE_HIP=${USE_HIP} + -DROCPROFSYS_USE_ROCM=${USE_HIP} -DROCPROFSYS_USE_OMPT=OFF -DROCPROFSYS_USE_PYTHON=ON -DROCPROFSYS_USE_MPI_HEADERS=ON diff --git a/projects/rocprofiler-systems/.github/workflows/ubuntu-focal.yml b/projects/rocprofiler-systems/.github/workflows/ubuntu-focal.yml index 31e813d1d0..04eb1577f4 100644 --- a/projects/rocprofiler-systems/.github/workflows/ubuntu-focal.yml +++ 
b/projects/rocprofiler-systems/.github/workflows/ubuntu-focal.yml @@ -100,7 +100,7 @@ jobs: chmod +x /opt/trace_processor/bin/trace_processor_shell && python3 -m pip install --upgrade pip && python3 -m pip install --upgrade numpy perfetto dataclasses && - python3 -m pip install 'cmake==3.18.4' && + python3 -m pip install 'cmake==3.21' && for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done && apt-get -y --purge autoremove && apt-get -y clean && @@ -145,7 +145,7 @@ jobs: -DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems -DROCPROFSYS_BUILD_TESTING=ON -DROCPROFSYS_USE_MPI=OFF - -DROCPROFSYS_USE_HIP=OFF + -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_OMPT=OFF -DROCPROFSYS_USE_PAPI=OFF -DROCPROFSYS_USE_PYTHON=${{ matrix.python }} @@ -245,16 +245,10 @@ jobs: fail-fast: false matrix: compiler: ['g++'] - rocm-version: ['6.2'] + rocm-version: ['6.3'] mpi-headers: ['OFF'] build-jobs: ['3'] - ctest-exclude: ['-LE "mpi-example|transpose"'] - include: - - compiler: 'g++' - rocm-version: 'latest' - mpi-headers: 'ON' - build-jobs: '2' - ctest-exclude: '-LE transpose' + ctest-exclude: ['-LE "transpose"'] env: BUILD_TYPE: MinSizeRel @@ -282,7 +276,7 @@ jobs: chmod +x /opt/trace_processor/bin/trace_processor_shell && python3 -m pip install --upgrade pip && python3 -m pip install --upgrade numpy perfetto dataclasses && - python3 -m pip install 'cmake==3.18.4' && + python3 -m pip install 'cmake==3.21' && for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done && apt-get -y --purge autoremove && apt-get -y clean && @@ -336,7 +330,7 @@ jobs: -DROCPROFSYS_BUILD_EXTRA_OPTIMIZATIONS=OFF -DROCPROFSYS_BUILD_LTO=OFF -DROCPROFSYS_USE_MPI=OFF - -DROCPROFSYS_USE_HIP=ON + -DROCPROFSYS_USE_ROCM=ON -DROCPROFSYS_MAX_THREADS=64 -DROCPROFSYS_USE_PAPI=OFF -DROCPROFSYS_USE_OMPT=OFF @@ -440,7 +434,7 @@ jobs: chmod +x /opt/trace_processor/bin/trace_processor_shell && python3 -m 
pip install --upgrade pip && python3 -m pip install --upgrade numpy perfetto dataclasses && - python3 -m pip install 'cmake==3.18.4' && + python3 -m pip install 'cmake==3.21' && sudo apt-get -y --purge autoremove && sudo apt-get -y clean @@ -477,7 +471,7 @@ jobs: -DROCPROFSYS_BUILD_TESTING=ON -DROCPROFSYS_BUILD_DYNINST=ON -DROCPROFSYS_USE_MPI=${USE_MPI} - -DROCPROFSYS_USE_HIP=OFF + -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_PYTHON=${{ matrix.python }} -DROCPROFSYS_USE_OMPT=${{ matrix.ompt }} -DROCPROFSYS_USE_PAPI=${{ matrix.papi }} @@ -593,7 +587,7 @@ jobs: chmod +x /opt/trace_processor/bin/trace_processor_shell && python3 -m pip install --upgrade pip && python3 -m pip install --upgrade numpy perfetto dataclasses && - python3 -m pip install 'cmake==3.18.4' && + python3 -m pip install 'cmake==3.21' && for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done && apt-get -y --purge autoremove && apt-get -y clean && @@ -625,7 +619,7 @@ jobs: -DROCPROFSYS_USE_PYTHON=ON -DROCPROFSYS_USE_OMPT=ON -DROCPROFSYS_USE_PAPI=ON - -DROCPROFSYS_USE_HIP=OFF + -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_RCCL=OFF -DROCPROFSYS_MAX_THREADS=64 -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl" diff --git a/projects/rocprofiler-systems/.github/workflows/ubuntu-jammy.yml b/projects/rocprofiler-systems/.github/workflows/ubuntu-jammy.yml index 47026737c0..1debacf5fb 100644 --- a/projects/rocprofiler-systems/.github/workflows/ubuntu-jammy.yml +++ b/projects/rocprofiler-systems/.github/workflows/ubuntu-jammy.yml @@ -75,22 +75,7 @@ jobs: static-libgcc: 'OFF' static-libstdcxx: 'OFF' build-dyninst: 'OFF' - rocm-version: '6.2' - - compiler: 'g++' - hip: 'ON' - mpi: 'OFF' - ompt: 'OFF' - papi: 'OFF' - python: 'ON' - lto: 'OFF' - strip: 'OFF' - hidden: 'ON' - build-type: 'Release' - mpi-headers: 'OFF' - static-libgcc: 'OFF' - static-libstdcxx: 'OFF' - build-dyninst: 'OFF' - rocm-version: 'latest' + rocm-version: '6.3' env: 
OMPI_ALLOW_RUN_AS_ROOT: 1 @@ -116,7 +101,7 @@ jobs: openmpi-bin python3-pip texinfo ${{ matrix.compiler }} && python3 -m pip install --upgrade pip && python3 -m pip install --upgrade numpy perfetto dataclasses && - python3 -m pip install 'cmake==3.18.4' && + python3 -m pip install 'cmake==3.21' && for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done - name: Install ROCm Packages @@ -183,7 +168,7 @@ jobs: -DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems-dev -DROCPROFSYS_BUILD_TESTING=ON -DROCPROFSYS_USE_MPI=${{ matrix.mpi }} - -DROCPROFSYS_USE_HIP=${{ matrix.hip }} + -DROCPROFSYS_USE_ROCM=${{ matrix.hip }} -DROCPROFSYS_USE_OMPT=${{ matrix.ompt }} -DROCPROFSYS_USE_PAPI=${{ matrix.papi }} -DROCPROFSYS_USE_PYTHON=${{ matrix.python }} diff --git a/projects/rocprofiler-systems/.github/workflows/ubuntu-noble.yml b/projects/rocprofiler-systems/.github/workflows/ubuntu-noble.yml index 30e03dd784..68ef66484e 100644 --- a/projects/rocprofiler-systems/.github/workflows/ubuntu-noble.yml +++ b/projects/rocprofiler-systems/.github/workflows/ubuntu-noble.yml @@ -101,7 +101,7 @@ jobs: -DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems \ -DROCPROFSYS_BUILD_TESTING=ON \ -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl" \ - -DROCPROFSYS_USE_HIP=${USE_ROCM} \ + -DROCPROFSYS_USE_ROCM=${USE_ROCM} \ -DRCOPROFSYS_USE_PYTHON=ON \ -DROCPROFSYS_STRIP_LIBRARIES=${{ matrix.strip }} \ -DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs \ diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index a3e3b60d39..68d13c4dc2 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -176,18 +176,11 @@ rocprofiler_systems_add_option(ROCPROFSYS_USE_CLANG_TIDY "Enable clang-tidy" OFF rocprofiler_systems_add_option(ROCPROFSYS_USE_BFD "Enable BFD support (map call-stack samples to LOC)" ON) rocprofiler_systems_add_option(ROCPROFSYS_USE_MPI "Enable MPI 
support" OFF) -rocprofiler_systems_add_option(ROCPROFSYS_USE_HIP "Enable HIP support" ON) +rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCM "Enable ROCm support" ON) rocprofiler_systems_add_option(ROCPROFSYS_USE_PAPI "Enable HW counter support via PAPI" ON) -rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCTRACER "Enable roctracer support" - ${ROCPROFSYS_USE_HIP}) -rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCPROFILER "Enable rocprofiler support" - ${ROCPROFSYS_USE_HIP}) -rocprofiler_systems_add_option( - ROCPROFSYS_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling" - ${ROCPROFSYS_USE_HIP}) rocprofiler_systems_add_option(ROCPROFSYS_USE_RCCL "Enable RCCL support" - ${ROCPROFSYS_USE_HIP}) + ${ROCPROFSYS_USE_ROCM}) rocprofiler_systems_add_option( ROCPROFSYS_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" ON) @@ -217,30 +210,10 @@ elseif("$ENV{ROCPROFSYS_CI}") endif() endif() -if(NOT ROCPROFSYS_USE_HIP) - set(ROCPROFSYS_USE_ROCTRACER - OFF - CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE) - set(ROCPROFSYS_USE_ROCPROFILER - OFF - CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE) - set(ROCPROFSYS_USE_ROCM_SMI - OFF - CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE) +if(NOT ROCPROFSYS_USE_ROCM) set(ROCPROFSYS_USE_RCCL OFF - CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE) -elseif( - ROCPROFSYS_USE_HIP - AND NOT ROCPROFSYS_USE_ROCTRACER - AND NOT ROCPROFSYS_USE_ROCPROFILER - AND NOT ROCPROFSYS_USE_ROCM_SMI - AND NOT ROCPROFSYS_USE_RCCL) - rocprofiler_systems_message( - AUTHOR_WARNING - "Setting ROCPROFSYS_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled" - ) - set(ROCPROFSYS_USE_HIP OFF) + CACHE BOOL "Disabled via ROCPROFSYS_USE_ROCM=OFF" FORCE) endif() if(ROCPROFSYS_BUILD_TESTING) @@ -378,14 +351,6 @@ endif() # # ------------------------------------------------------------------------------# -if(NOT ROCPROFSYS_USE_ROCTRACER AND NOT ROCPROFSYS_USE_ROCPROFILER) 
- set(ROCPROFSYS_HSA_ENV "# ") -endif() - -if(NOT ROCPROFSYS_USE_ROCPROFILER) - set(ROCPROFSYS_ROCP_ENV "# ") -endif() - configure_file( ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/doc/${PROJECT_NAME}/LICENSE diff --git a/projects/rocprofiler-systems/cmake/ConfigCPack.cmake b/projects/rocprofiler-systems/cmake/ConfigCPack.cmake index ce5fa73146..bb10029b64 100644 --- a/projects/rocprofiler-systems/cmake/ConfigCPack.cmake +++ b/projects/rocprofiler-systems/cmake/ConfigCPack.cmake @@ -54,9 +54,7 @@ set(ROCPROFSYS_CPACK_SYSTEM_NAME CACHE STRING "System name, e.g. Linux or Ubuntu-20.04") set(ROCPROFSYS_CPACK_PACKAGE_SUFFIX "") -if(ROCPROFSYS_USE_HIP - OR ROCPROFSYS_USE_ROCTRACER - OR ROCPROFSYS_USE_ROCM_SMI) +if(ROCPROFSYS_USE_ROCM) set(ROCPROFSYS_CPACK_PACKAGE_SUFFIX "${ROCPROFSYS_CPACK_PACKAGE_SUFFIX}-ROCm-${ROCmVersion_NUMERIC_VERSION}") endif() @@ -159,19 +157,12 @@ if(NOT ROCPROFSYS_BUILD_DYNINST) endif() endif() if(ROCmVersion_FOUND) - set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})") - set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})") set(_ROCM_SMI_SUFFIX " (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})") endif() -if(ROCPROFSYS_USE_ROCM_SMI) +if(ROCPROFSYS_USE_ROCM) list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocm-smi-lib${_ROCM_SMI_SUFFIX}") -endif() -if(ROCPROFSYS_USE_ROCTRACER) - list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}") -endif() -if(ROCPROFSYS_USE_ROCPROFILER) - list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}") + list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-sdk (>= ${rocprofiler-sdk_VERSION})") endif() if(ROCPROFSYS_USE_MPI) if("${ROCPROFSYS_MPI_IMPL}" STREQUAL "openmpi") diff --git a/projects/rocprofiler-systems/cmake/PAPI.cmake b/projects/rocprofiler-systems/cmake/PAPI.cmake index 780dd101d1..394edd43f9 100644 --- a/projects/rocprofiler-systems/cmake/PAPI.cmake +++ 
b/projects/rocprofiler-systems/cmake/PAPI.cmake @@ -109,13 +109,6 @@ set(_ROCPROFSYS_PAPI_COMPONENTS ) if(ROCPROFSYS_PAPI_AUTO_COMPONENTS) - # rocm - if(ROCPROFSYS_USE_HIP - OR ROCPROFSYS_USE_ROCTRACER - OR ROCPROFSYS_USE_ROCM_SMI) - list(APPEND _ROCPROFSYS_PAPI_COMPONENTS rocm) - endif() - # lmsensors find_path(ROCPROFSYS_PAPI_LMSENSORS_ROOT_DIR NAMES include/sensors/sensors.h include/sensors.h) @@ -209,28 +202,35 @@ externalproject_add( BUILD_IN_SOURCE 1 PATCH_COMMAND ${CMAKE_COMMAND} -E env CC=${PAPI_C_COMPILER} - CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation LIBS=-lrt LDFLAGS=-lrt - ${ROCPROFSYS_PAPI_EXTRA_ENV} /configure --quiet + CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free LIBS=-lrt + LDFLAGS=-lrt ${ROCPROFSYS_PAPI_EXTRA_ENV} /configure --quiet --prefix=${ROCPROFSYS_PAPI_INSTALL_DIR} --with-static-lib=yes --with-shared-lib=no --with-perf-events --with-tests=no --with-components=${_ROCPROFSYS_PAPI_COMPONENTS} --libdir=${ROCPROFSYS_PAPI_INSTALL_DIR}/lib CONFIGURE_COMMAND - ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation + ${CMAKE_COMMAND} -E env + CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free ${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s -j ${ROCPROFSYS_PAPI_CONFIGURE_JOBS} - BUILD_COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation - ${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s + BUILD_COMMAND + ${CMAKE_COMMAND} -E env + CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free + ${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s INSTALL_COMMAND "" BUILD_BYPRODUCTS "${_ROCPROFSYS_PAPI_BUILD_BYPRODUCTS}") # target for re-executing the installation add_custom_target( rocprofiler-systems-papi-install - COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation - ${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s - COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ 
-Wno-stringop-truncation - ${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s + COMMAND + ${CMAKE_COMMAND} -E env + CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free + ${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s + COMMAND + ${CMAKE_COMMAND} -E env + CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free + ${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s WORKING_DIRECTORY ${ROCPROFSYS_PAPI_SOURCE_DIR}/src COMMENT "Installing PAPI...") diff --git a/projects/rocprofiler-systems/cmake/Packages.cmake b/projects/rocprofiler-systems/cmake/Packages.cmake index eadbfe030e..c508a459d2 100644 --- a/projects/rocprofiler-systems/cmake/Packages.cmake +++ b/projects/rocprofiler-systems/cmake/Packages.cmake @@ -15,14 +15,12 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-threading rocprofiler_systems_add_interface_library( rocprofiler-systems-dyninst "Provides flags and libraries for Dyninst (dynamic instrumentation)") -rocprofiler_systems_add_interface_library(rocprofiler-systems-hip - "Provides flags and libraries for HIP") +rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm + "Provides flags and libraries for ROCm") rocprofiler_systems_add_interface_library(rocprofiler-systems-roctracer "Provides flags and libraries for roctracer") rocprofiler_systems_add_interface_library(rocprofiler-systems-rocprofiler "Provides flags and libraries for rocprofiler") -rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm-smi - "Provides flags and libraries for rocm-smi") rocprofiler_systems_add_interface_library( rocprofiler-systems-rccl "Provides flags for ROCm Communication Collectives Library (RCCL)") @@ -50,10 +48,7 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-compile-definition # libraries with relevant compile definitions set(ROCPROFSYS_EXTENSION_LIBRARIES - rocprofiler-systems::rocprofiler-systems-hip - 
rocprofiler-systems::rocprofiler-systems-roctracer - rocprofiler-systems::rocprofiler-systems-rocprofiler - rocprofiler-systems::rocprofiler-systems-rocm-smi + rocprofiler-systems::rocprofiler-systems-rocm rocprofiler-systems::rocprofiler-systems-rccl rocprofiler-systems::rocprofiler-systems-bfd rocprofiler-systems::rocprofiler-systems-mpi @@ -127,14 +122,11 @@ endforeach() # ----------------------------------------------------------------------------------------# # -# hip version +# ROCm Version # # ----------------------------------------------------------------------------------------# -if(ROCPROFSYS_USE_HIP - OR ROCPROFSYS_USE_ROCTRACER - OR ROCPROFSYS_USE_ROCPROFILER - OR ROCPROFSYS_USE_ROCM_SMI) +if(ROCPROFSYS_USE_ROCM) find_package(ROCmVersion) if(NOT ROCmVersion_FOUND) @@ -164,13 +156,13 @@ if(ROCPROFSYS_USE_HIP endif() set(ROCPROFSYS_ROCM_VERSION ${ROCmVersion_FULL_VERSION}) - set(ROCPROFSYS_HIP_VERSION_MAJOR ${ROCmVersion_MAJOR_VERSION}) - set(ROCPROFSYS_HIP_VERSION_MINOR ${ROCmVersion_MINOR_VERSION}) - set(ROCPROFSYS_HIP_VERSION_PATCH ${ROCmVersion_PATCH_VERSION}) - set(ROCPROFSYS_HIP_VERSION ${ROCmVersion_TRIPLE_VERSION}) + set(ROCPROFSYS_ROCM_VERSION_MAJOR ${ROCmVersion_MAJOR_VERSION}) + set(ROCPROFSYS_ROCM_VERSION_MINOR ${ROCmVersion_MINOR_VERSION}) + set(ROCPROFSYS_ROCM_VERSION_PATCH ${ROCmVersion_PATCH_VERSION}) + set(ROCPROFSYS_ROCM_VERSION ${ROCmVersion_TRIPLE_VERSION}) - if(ROCPROFSYS_HIP_VERSION_MAJOR GREATER_EQUAL 4 AND ROCPROFSYS_HIP_VERSION_MINOR - GREATER 3) + if(ROCPROFSYS_ROCM_VERSION_MAJOR GREATER_EQUAL 4 AND ROCPROFSYS_ROCM_VERSION_MINOR + GREATER 3) set(roctracer_kfdwrapper_LIBRARY) endif() @@ -181,64 +173,30 @@ if(ROCPROFSYS_USE_HIP rocprofiler_systems_add_feature(ROCPROFSYS_ROCM_VERSION "ROCm version used by rocprofiler-systems") else() - set(ROCPROFSYS_HIP_VERSION "0.0.0") - set(ROCPROFSYS_HIP_VERSION_MAJOR 0) - set(ROCPROFSYS_HIP_VERSION_MINOR 0) - set(ROCPROFSYS_HIP_VERSION_PATCH 0) + set(ROCPROFSYS_ROCM_VERSION "0.0.0") + 
set(ROCPROFSYS_ROCM_VERSION_MAJOR 0) + set(ROCPROFSYS_ROCM_VERSION_MINOR 0) + set(ROCPROFSYS_ROCM_VERSION_PATCH 0) endif() # ----------------------------------------------------------------------------------------# # -# HIP +# ROCm # # ----------------------------------------------------------------------------------------# -if(ROCPROFSYS_USE_HIP) - find_package(hip ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) - rocprofiler_systems_target_compile_definitions(rocprofiler-systems-hip - INTERFACE ROCPROFSYS_USE_HIP) - target_link_libraries(rocprofiler-systems-hip INTERFACE hip::host) -endif() +if(ROCPROFSYS_USE_ROCM) + find_package(rocprofiler-sdk ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) + rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocm + INTERFACE ROCPROFSYS_USE_ROCM) + target_link_libraries(rocprofiler-systems-rocm + INTERFACE rocprofiler-sdk::rocprofiler-sdk) -# ----------------------------------------------------------------------------------------# -# -# roctracer -# -# ----------------------------------------------------------------------------------------# - -if(ROCPROFSYS_USE_ROCTRACER) - find_package(roctracer ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) - rocprofiler_systems_target_compile_definitions(rocprofiler-systems-roctracer - INTERFACE ROCPROFSYS_USE_ROCTRACER) - target_link_libraries( - rocprofiler-systems-roctracer - INTERFACE roctracer::roctracer rocprofiler-systems::rocprofiler-systems-hip) -endif() - -# ----------------------------------------------------------------------------------------# -# -# rocprofiler -# -# ----------------------------------------------------------------------------------------# -if(ROCPROFSYS_USE_ROCPROFILER) - find_package(rocprofiler ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) - rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocprofiler - INTERFACE ROCPROFSYS_USE_ROCPROFILER) - target_link_libraries(rocprofiler-systems-rocprofiler - INTERFACE 
rocprofiler::rocprofiler) -endif() - -# ----------------------------------------------------------------------------------------# -# -# rocm-smi -# -# ----------------------------------------------------------------------------------------# - -if(ROCPROFSYS_USE_ROCM_SMI) find_package(rocm-smi ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) - rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocm-smi - INTERFACE ROCPROFSYS_USE_ROCM_SMI) - target_link_libraries(rocprofiler-systems-rocm-smi INTERFACE rocm-smi::rocm-smi) + target_link_libraries(rocprofiler-systems-rocm INTERFACE rocm-smi::rocm-smi) + + # find_package(amd-smi ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) + # target_link_libraries(rocprofiler-systems-rocm INTERFACE amd-smi::amd-smi) endif() # ----------------------------------------------------------------------------------------# diff --git a/projects/rocprofiler-systems/cmake/Templates/modulefile.in b/projects/rocprofiler-systems/cmake/Templates/modulefile.in index 9e8b7c75ef..cf45f889d6 100644 --- a/projects/rocprofiler-systems/cmake/Templates/modulefile.in +++ b/projects/rocprofiler-systems/cmake/Templates/modulefile.in @@ -14,7 +14,3 @@ prepend-path PATH "${ROOT}/bin" prepend-path LD_LIBRARY_PATH "${ROOT}/@CMAKE_INSTALL_LIBDIR@" prepend-path PYTHONPATH "${ROOT}/@CMAKE_INSTALL_PYTHONDIR@" setenv @PROJECT_NAME_UNDERSCORED@_DIR "${ROOT}/@CMAKE_INSTALL_DATAROOTDIR@/cmake/@PROJECT_NAME@" - -# @ROCPROFSYS_HSA_ENV@setenv HSA_TOOLS_LIB "${ROOT}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@" -# @ROCPROFSYS_HSA_ENV@setenv HSA_TOOLS_REPORT_LOAD_FAILURE 1 -# @ROCPROFSYS_ROCP_ENV@setenv ROCP_TOOL_LIB "${ROOT}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@" diff --git a/projects/rocprofiler-systems/cmake/Templates/setup-env.sh.in b/projects/rocprofiler-systems/cmake/Templates/setup-env.sh.in index b6c4a97de0..882c7838c0 100644 --- 
a/projects/rocprofiler-systems/cmake/Templates/setup-env.sh.in +++ b/projects/rocprofiler-systems/cmake/Templates/setup-env.sh.in @@ -26,12 +26,3 @@ export LD_LIBRARY_PATH export PYTHONPATH export CMAKE_PREFIX_PATH export @PROJECT_NAME_UNDERSCORED@_DIR - -# ROCm environment variables -# @ROCPROFSYS_HSA_ENV@HSA_TOOLS_LIB="${BASEDIR}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys-dl@CMAKE_SHARED_LIBRARY_SUFFIX@" -# @ROCPROFSYS_HSA_ENV@HSA_TOOLS_REPORT_LOAD_FAILURE=1 -# @ROCPROFSYS_ROCP_ENV@ROCP_TOOL_LIB="${BASEDIR}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@" - -# @ROCPROFSYS_HSA_ENV@export HSA_TOOLS_LIB -# @ROCPROFSYS_HSA_ENV@export HSA_TOOLS_REPORT_LOAD_FAILURE -# @ROCPROFSYS_ROCP_ENV@export ROCP_TOOL_LIB diff --git a/projects/rocprofiler-systems/docker/Dockerfile.opensuse b/projects/rocprofiler-systems/docker/Dockerfile.opensuse index 2dbb1b5d43..31594cf2c1 100644 --- a/projects/rocprofiler-systems/docker/Dockerfile.opensuse +++ b/projects/rocprofiler-systems/docker/Dockerfile.opensuse @@ -25,7 +25,7 @@ RUN zypper --non-interactive update -y && \ zypper --non-interactive install -y -t pattern devel_basis && \ zypper --non-interactive install -y binutils-gold cmake curl dpkg-devel \ gcc-c++ git libnuma-devel openmpi3-devel python3-pip rpm-build wget && \ - python3 -m pip install 'cmake==3.18.4' + python3 -m pip install 'cmake==3.21' ARG ROCM_VERSION=0.0 ARG AMDGPU_RPM=6.2/sle/15.6/amdgpu-install-6.2.60200-1.noarch.rpm diff --git a/projects/rocprofiler-systems/docker/Dockerfile.opensuse.ci b/projects/rocprofiler-systems/docker/Dockerfile.opensuse.ci index 9d7ac120ce..2267cf6a66 100644 --- a/projects/rocprofiler-systems/docker/Dockerfile.opensuse.ci +++ b/projects/rocprofiler-systems/docker/Dockerfile.opensuse.ci @@ -31,7 +31,7 @@ RUN zypper --non-interactive update -y && \ gcc-c++ git libnuma-devel openmpi3-devel papi-devel python3-pip \ rpm-build wget && \ zypper --non-interactive clean --all && \ - 
python3 -m pip install 'cmake==3.18.4' + python3 -m pip install 'cmake==3.21' COPY ./dyninst-source /tmp/dyninst diff --git a/projects/rocprofiler-systems/docker/Dockerfile.rhel b/projects/rocprofiler-systems/docker/Dockerfile.rhel index 298d69ed6a..1a6e2342fa 100644 --- a/projects/rocprofiler-systems/docker/Dockerfile.rhel +++ b/projects/rocprofiler-systems/docker/Dockerfile.rhel @@ -18,7 +18,7 @@ RUN yum groupinstall -y "Development Tools" && \ yum install -y --allowerasing cmake curl dpkg-devel numactl-devel openmpi-devel \ papi-devel python3-pip texinfo wget which zlib-devel && \ yum clean all && \ - python3 -m pip install 'cmake==3.18.4' + python3 -m pip install 'cmake==3.21' ARG ROCM_VERSION=0.0 ARG AMDGPU_RPM=6.2/rhel/9.4/amdgpu-install-6.2.60202-1.el9.noarch.rpm diff --git a/projects/rocprofiler-systems/docker/Dockerfile.rhel.ci b/projects/rocprofiler-systems/docker/Dockerfile.rhel.ci index 2429a2bbf5..6b09697396 100644 --- a/projects/rocprofiler-systems/docker/Dockerfile.rhel.ci +++ b/projects/rocprofiler-systems/docker/Dockerfile.rhel.ci @@ -22,7 +22,7 @@ RUN yum groupinstall -y "Development Tools" && \ yum install -y --allowerasing cmake curl dpkg-devel numactl-devel \ openmpi-devel papi-devel python3-pip texinfo wget which zlib-devel && \ yum clean all && \ - python3 -m pip install 'cmake==3.18.4' + python3 -m pip install 'cmake==3.21' COPY ./dyninst-source /tmp/dyninst diff --git a/projects/rocprofiler-systems/docker/Dockerfile.ubuntu b/projects/rocprofiler-systems/docker/Dockerfile.ubuntu index d41fcb9c85..a98d692242 100644 --- a/projects/rocprofiler-systems/docker/Dockerfile.ubuntu +++ b/projects/rocprofiler-systems/docker/Dockerfile.ubuntu @@ -30,9 +30,9 @@ RUN apt-get update && \ python3-pip rpm texinfo wget && \ OS_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed 's/=/ /'1 | awk '{print $NF}' | sed 's/"//g') && \ if [ "${OS_VERSION}" == "24.04" ]; then \ - python3 -m pip install --break-system-packages 'cmake==3.18.4'; \ + python3 -m pip 
install --break-system-packages 'cmake==3.21'; \ else \ - python3 -m pip install 'cmake==3.18.4'; \ + python3 -m pip install 'cmake==3.21'; \ fi RUN if [ "${ROCM_VERSION}" != "0.0" ]; then \ diff --git a/projects/rocprofiler-systems/docker/Dockerfile.ubuntu.ci b/projects/rocprofiler-systems/docker/Dockerfile.ubuntu.ci index 5058ef81c1..36c26f6bdc 100644 --- a/projects/rocprofiler-systems/docker/Dockerfile.ubuntu.ci +++ b/projects/rocprofiler-systems/docker/Dockerfile.ubuntu.ci @@ -31,9 +31,9 @@ RUN apt-get update && \ python3-pip texinfo unzip wget zip zlib1g-dev && \ apt-get autoclean && \ if [ "${OS_VERSION}" == "24.04" ]; then \ - python3 -m pip install --break-system-packages 'cmake==3.18.4' \ + python3 -m pip install --break-system-packages 'cmake==3.21' \ else \ - python3 -m pip install 'cmake==3.18.4'; \ + python3 -m pip install 'cmake==3.21'; \ fi COPY ./dyninst-source /tmp/dyninst diff --git a/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst b/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst index f624318a6b..5e50242969 100644 --- a/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst +++ b/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst @@ -228,7 +228,7 @@ Generating a default configuration file ROCPROFSYS_PROFILE = false ROCPROFSYS_USE_SAMPLING = false ROCPROFSYS_USE_PROCESS_SAMPLING = true - ROCPROFSYS_USE_ROCTRACER = true + ROCPROFSYS_USE_ROCM = true ROCPROFSYS_USE_ROCM_SMI = true ROCPROFSYS_USE_KOKKOSP = false ROCPROFSYS_USE_CODE_COVERAGE = false @@ -248,9 +248,6 @@ Generating a default configuration file ROCPROFSYS_PERFETTO_FILE = perfetto-trace.proto ROCPROFSYS_PERFETTO_FILL_POLICY = discard ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB = 4096 - ROCPROFSYS_ROCTRACER_HSA_ACTIVITY = false - ROCPROFSYS_ROCTRACER_HSA_API = false - ROCPROFSYS_ROCTRACER_HSA_API_TYPES = ROCPROFSYS_SAMPLING_CPUS = ROCPROFSYS_SAMPLING_DELAY = 0.5 ROCPROFSYS_SAMPLING_FREQ = 10 @@ -363,13 
+360,10 @@ Viewing the setting descriptions | ROCPROFSYS_PERFETTO_FILL_POLICY | Behavior when perfetto buffer is ful... | | ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... | | ROCPROFSYS_PRECISION | Set the global output precision for ... | - | ROCPROFSYS_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support | - | ROCPROFSYS_ROCTRACER_HSA_API | Enable HSA API tracing support | - | ROCPROFSYS_ROCTRACER_HSA_API_TYPES | HSA API type to collect | | ROCPROFSYS_SAMPLING_CPUS | CPUs to collect frequency informatio... | | ROCPROFSYS_SAMPLING_DELAY | Number of seconds to wait before the... | | ROCPROFSYS_SAMPLING_FREQ | Number of software interrupts per se... | - | ROCPROFSYS_SAMPLING_GPUS | Devices to query when ROCPROFSYS_USE_... | + | ROCPROFSYS_SAMPLING_GPUS | Devices to query when ROCPROFSYS_USE... | | ROCPROFSYS_SCIENTIFIC | Set the global numerical reporting t... | | ROCPROFSYS_STRICT_CONFIG | Throw errors for unknown setting nam... | | ROCPROFSYS_SUPPRESS_CONFIG | Disable processing of setting config... | @@ -391,13 +385,13 @@ Viewing the setting descriptions | ROCPROFSYS_TRACE | Enable perfetto backend | | ROCPROFSYS_USE_PID | Enable tagging filenames with proces... | | ROCPROFSYS_USE_ROCM_SMI | Enable sampling GPU power, temp, uti... | - | ROCPROFSYS_USE_ROCTRACER | Enable ROCM tracing | + | ROCPROFSYS_USE_ROCM | Enable ROCM tracing | | ROCPROFSYS_USE_SAMPLING | Enable statistical sampling of call-... | | ROCPROFSYS_USE_PROCESS_SAMPLING | Enable a background thread which sam... | | ROCPROFSYS_PROFILE | Enable timemory backend | | ROCPROFSYS_VERBOSE | Verbosity level | | ROCPROFSYS_WIDTH | Set the global output width for comp... 
| - |-----------------------------------------|-----------------------------------------| + |------------------------------------------|-----------------------------------------| Viewing components ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst b/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst index f8702373e0..3821abd589 100644 --- a/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst +++ b/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst @@ -268,8 +268,6 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ $ rocprof-sys-sample -- ./parallel-overhead-locks 30 4 100 - HSA_TOOLS_LIB=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1 - HSA_TOOLS_REPORT_LOAD_FAILURE=1 LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1 ROCPROFSYS_USE_PROCESS_SAMPLING=false ROCPROFSYS_USE_SAMPLING=true @@ -283,8 +281,6 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ $ rocprof-sys-sample -PTDH -I all -- ./parallel-overhead-locks 30 4 100 - HSA_TOOLS_LIB=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1 - HSA_TOOLS_REPORT_LOAD_FAILURE=1 KOKKOS_PROFILE_LIBRARY=/opt/rocprofiler-systems/lib/librocprof-sys.so.1.7.1 LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1 ROCPROFSYS_CPU_FREQ_ENABLED=true @@ -298,9 +294,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ ROCPROFSYS_USE_PROCESS_SAMPLING=true ROCPROFSYS_USE_RCCLP=true ROCPROFSYS_USE_ROCM_SMI=true - ROCPROFSYS_USE_ROCPROFILER=true - ROCPROFSYS_USE_ROCTRACER=true - ROCPROFSYS_USE_ROCTX=true + ROCPROFSYS_USE_ROCM=true ROCPROFSYS_USE_SAMPLING=true ROCPROFSYS_PROFILE=true OMP_TOOL_LIBRARIES=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1 @@ -330,9 +324,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ 
ROCPROFSYS_USE_PROCESS_SAMPLING=true ROCPROFSYS_USE_RCCLP=false ROCPROFSYS_USE_ROCM_SMI=false - ROCPROFSYS_USE_ROCPROFILER=false - ROCPROFSYS_USE_ROCTRACER=false - ROCPROFSYS_USE_ROCTX=false + ROCPROFSYS_USE_ROCM=false ROCPROFSYS_USE_SAMPLING=true ROCPROFSYS_PROFILE=true ... @@ -363,9 +355,7 @@ Here is the full output from the previous ROCPROFSYS_USE_PROCESS_SAMPLING=true ROCPROFSYS_USE_RCCLP=false ROCPROFSYS_USE_ROCM_SMI=false - ROCPROFSYS_USE_ROCPROFILER=false - ROCPROFSYS_USE_ROCTRACER=false - ROCPROFSYS_USE_ROCTX=false + ROCPROFSYS_USE_ROCM=false ROCPROFSYS_USE_SAMPLING=true [rocprof-sys][dl][1785877] rocprofsys_main [rocprof-sys][1785877][rocprofsys_init_tooling] Instrumentation mode: Sampling diff --git a/projects/rocprofiler-systems/docs/install/install.rst b/projects/rocprofiler-systems/docs/install/install.rst index 285f635bcf..dd3c67db0c 100644 --- a/projects/rocprofiler-systems/docs/install/install.rst +++ b/projects/rocprofiler-systems/docs/install/install.rst @@ -241,8 +241,8 @@ Installing ROCm Systems Profiler ----------------------------------- ROCm Systems Profiler has CMake configuration options for MPI support (``ROCPROFSYS_USE_MPI`` or -``ROCPROFSYS_USE_MPI_HEADERS``), HIP kernel tracing (``ROCPROFSYS_USE_ROCTRACER``), -ROCm device sampling (``ROCPROFSYS_USE_ROCM_SMI``), OpenMP-Tools (``ROCPROFSYS_USE_OMPT``), +``ROCPROFSYS_USE_MPI_HEADERS``), +ROCm tracing and sampling (``ROCPROFSYS_USE_ROCM``), OpenMP-Tools (``ROCPROFSYS_USE_OMPT``), hardware counters via PAPI (``ROCPROFSYS_USE_PAPI``), among other features. Various additional features can be enabled via the ``TIMEMORY_USE_*`` `CMake options `_. @@ -256,22 +256,20 @@ in `the Perfetto UI `_. .. 
code-block:: shell git clone https://github.com/ROCm/rocprofiler-systems.git rocprof-sys-source - cmake \ - -B rocprof-sys-build \ + cmake \ + -B rocprof-sys-build \ -D CMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems \ - -D ROCPROFSYS_USE_HIP=ON \ - -D ROCPROFSYS_USE_ROCM_SMI=ON \ - -D ROCPROFSYS_USE_ROCTRACER=ON \ - -D ROCPROFSYS_USE_PYTHON=ON \ - -D ROCPROFSYS_USE_OMPT=ON \ - -D ROCPROFSYS_USE_MPI_HEADERS=ON \ - -D ROCPROFSYS_BUILD_PAPI=ON \ - -D ROCPROFSYS_BUILD_LIBUNWIND=ON \ - -D ROCPROFSYS_BUILD_DYNINST=ON \ - -D DYNINST_BUILD_TBB=ON \ - -D DYNINST_BUILD_BOOST=ON \ - -D DYNINST_BUILD_ELFUTILS=ON \ - -D DYNINST_BUILD_LIBIBERTY=ON \ + -D ROCPROFSYS_USE_ROCM=ON \ + -D ROCPROFSYS_USE_PYTHON=ON \ + -D ROCPROFSYS_USE_OMPT=ON \ + -D ROCPROFSYS_USE_MPI_HEADERS=ON \ + -D ROCPROFSYS_BUILD_PAPI=ON \ + -D ROCPROFSYS_BUILD_LIBUNWIND=ON \ + -D ROCPROFSYS_BUILD_DYNINST=ON \ + -D DYNINST_BUILD_TBB=ON \ + -D DYNINST_BUILD_BOOST=ON \ + -D DYNINST_BUILD_ELFUTILS=ON \ + -D DYNINST_BUILD_LIBIBERTY=ON \ rocprof-sys-source cmake --build rocprof-sys-build --target all --parallel 8 cmake --build rocprof-sys-build --target install diff --git a/projects/rocprofiler-systems/scripts/build-release.sh b/projects/rocprofiler-systems/scripts/build-release.sh index e8f419aa9a..e95de36e87 100755 --- a/projects/rocprofiler-systems/scripts/build-release.sh +++ b/projects/rocprofiler-systems/scripts/build-release.sh @@ -372,7 +372,7 @@ if [ "${IS_DOCKER}" -ne 0 ]; then git config --global --add safe.directory ${PWD verbose-run echo "Build rocprofiler-systems installers with generators: ${GENERATORS}" -build-and-package ${WITH_CORE} ${DISTRO}-core -DROCPROFSYS_USE_HIP=OFF -DROCPROFSYS_USE_MPI=OFF -build-and-package ${WITH_MPI} ${DISTRO}-${MPI_IMPL} -DROCPROFSYS_USE_HIP=OFF -DROCPROFSYS_USE_MPI=ON -build-and-package ${WITH_ROCM} ${DISTRO}-rocm-${ROCM_VERSION} -DROCPROFSYS_USE_HIP=ON -DROCPROFSYS_USE_MPI=OFF -build-and-package ${WITH_ROCM_MPI} ${DISTRO}-rocm-${ROCM_VERSION}-${MPI_IMPL} 
-DROCPROFSYS_USE_HIP=ON -DROCPROFSYS_USE_MPI=ON +build-and-package ${WITH_CORE} ${DISTRO}-core -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_MPI=OFF +build-and-package ${WITH_MPI} ${DISTRO}-${MPI_IMPL} -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_MPI=ON +build-and-package ${WITH_ROCM} ${DISTRO}-rocm-${ROCM_VERSION} -DROCPROFSYS_USE_ROCM=ON -DROCPROFSYS_USE_MPI=OFF +build-and-package ${WITH_ROCM_MPI} ${DISTRO}-rocm-${ROCM_VERSION}-${MPI_IMPL} -DROCPROFSYS_USE_ROCM=ON -DROCPROFSYS_USE_MPI=ON diff --git a/projects/rocprofiler-systems/source/bin/CMakeLists.txt b/projects/rocprofiler-systems/source/bin/CMakeLists.txt index de27963dca..3ac0cd0786 100644 --- a/projects/rocprofiler-systems/source/bin/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/bin/CMakeLists.txt @@ -1,17 +1,8 @@ # executable RPATH -if(ROCPROFSYS_USE_ROCPROFILER - AND rocprofiler_LIBRARY_DIR - AND ROCmVersion_TRIPLE_VERSION VERSION_LESS 5.2.0 - AND NOT CMAKE_INSTALL_RPATH_USE_LINK_PATH) - set(ROCPROFSYS_EXE_INSTALL_RPATH - "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}:${rocprofiler_LIBRARY_DIR}" - ) -else() - set(ROCPROFSYS_EXE_INSTALL_RPATH - "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}" - ) -endif() +set(ROCPROFSYS_EXE_INSTALL_RPATH + "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}" + ) # executables add_subdirectory(rocprof-sys-avail) diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp index 9f5d67a9ac..2d8c1a7562 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp @@ -33,8 +33,7 @@ #include "api.hpp" #include "core/config.hpp" #include "core/gpu.hpp" -#include "core/hip_runtime.hpp" -#include "library/rocprofiler.hpp" +#include "library/rocm.hpp" #include #include @@ 
-119,7 +118,7 @@ write_hw_counter_info(std::ostream&, const array_t& = {}, namespace { // initialize HIP before main so that librocprof-sys is not HSA_TOOLS_LIB -int gpu_count = rocprofsys::gpu::hip_device_count(); +int gpu_count = rocprofsys::gpu::device_count(); // statically allocated shared_ptrs to prevent use after free errors auto timemory_manager = tim::manager::master_instance(); @@ -508,15 +507,15 @@ main(int argc, char** argv) return EXIT_FAILURE; } -#if ROCPROFSYS_USE_HIP > 0 +#if ROCPROFSYS_USE_ROCM > 0 if(gpu_count > 0) { size_t _num_metrics = 0; try { - // call to rocm_metrics() will add choices to ROCPROFSYS_ROCM_EVENTS setting + // call to rocm_events() will add choices to ROCPROFSYS_ROCM_EVENTS setting // so always perform this call even if list of HW counters is not requested - _num_metrics = rocprofsys::rocprofiler::rocm_metrics().size(); + _num_metrics = rocprofsys::rocm::rocm_events().size(); } catch(std::runtime_error& _e) { verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what()); @@ -615,9 +614,9 @@ main(int argc, char** argv) } } - signal(SIGABRT, &dump_log_abort); - signal(SIGSEGV, &dump_log_abort); - signal(SIGQUIT, &dump_log_abort); + // signal(SIGABRT, &dump_log_abort); + // signal(SIGSEGV, &dump_log_abort); + // signal(SIGQUIT, &dump_log_abort); if(!os) os = &std::cout; @@ -641,6 +640,8 @@ main(int argc, char** argv) } dump_log(); + const_cast&>(tim::settings::shared_instance()).reset(); + return 0; } @@ -1076,7 +1077,7 @@ write_hw_counter_info(std::ostream& os, const array_t& options, auto _papi_events = tim::papi::available_events_info(); auto _rocm_events = - (gpu_count > 0) ? rocprofsys::rocprofiler::rocm_metrics() : hwcounter_info_t{}; + (gpu_count > 0) ? 
rocprofsys::rocm::rocm_events() : hwcounter_info_t{}; if(alphabetical) { diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp index ac2b7738db..0aa0f9eeb2 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp @@ -339,7 +339,7 @@ generate_config(std::string _config_file, const std::set& _config_f for(const auto* itr : { "ROCPROFSYS_CONFIG", "ROCPROFSYS_MODE", "ROCPROFSYS_TRACE", "ROCPROFSYS_PROFILE", "ROCPROFSYS_USE_SAMPLING", - "ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCTRACER", + "ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCM", "ROCPROFSYS_USE_ROCM_SMI", "ROCPROFSYS_USE_KOKKOSP", "ROCPROFSYS_USE_OMPT", "ROCPROFSYS_USE", "ROCPROFSYS_OUTPUT" }) { diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/info_type.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/info_type.cpp index 49a85d0657..d7d7d4e494 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/info_type.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/info_type.cpp @@ -29,8 +29,6 @@ #include "library/components/fork_gotcha.hpp" #include "library/components/mpi_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" -#include "library/components/rocprofiler.hpp" -#include "library/components/roctracer.hpp" #include #include diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp index be0da1ee8d..855b20307d 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp @@ -752,10 +752,6 @@ parse_args(int argc, char** argv, std::vector& _env, parser.end_group(); -#if ROCPROFSYS_HIP_VERSION > 0 && 
ROCPROFSYS_HIP_VERSION < 50300 - update_env(_env, "HSA_ENABLE_INTERRUPT", 0); -#endif - auto _inpv = std::vector{}; auto _outv = std::vector{}; bool _hash = false; @@ -824,11 +820,6 @@ parse_args(int argc, char** argv, std::vector& _env, add_default_env(_env, "ROCPROFSYS_USE_MPIP", true); #endif -#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0 - add_default_env(_env, "ROCPROFSYS_ROCTRACER_HIP_API", true); - add_default_env(_env, "ROCPROFSYS_ROCTRACER_HSA_API", true); -#endif - #if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0 add_default_env(_env, "ROCPROFSYS_USE_RCCLP", true); #endif diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/CMakeLists.txt b/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/CMakeLists.txt index ecacdbb90b..3e32a39363 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/CMakeLists.txt @@ -35,6 +35,8 @@ target_link_libraries( timemory::timemory-extensions timemory::timemory-core) +add_target_flag_if_avail(rocprofiler-systems-instrument "-Wno-deprecated-declarations") + set_target_properties( rocprofiler-systems-instrument PROPERTIES BUILD_RPATH "\$ORIGIN:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}" diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/internal_libs.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/internal_libs.cpp index 3bda766905..5d4dd66842 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/internal_libs.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/internal_libs.cpp @@ -312,13 +312,25 @@ get_internal_basic_libs_impl() "liblzma.so" }; // shared libraries used by rocprof-sys - const auto _omni_libs = strview_init_t{ - "libstdc++.so.6", "libgotcha.so", "libunwind-coredump.so", - "libunwind-generic.so", "libunwind-ptrace.so", "libunwind-setjmp.so", - "libunwind.so", 
"libunwind-x86_64.so", "librocm_smi64.so", - "libroctx64.so", "librocmtools.so", "libroctracer64.so", - "librocprofiler64.so", "libpapi.so", "libpfm.so" - }; + const auto _omni_libs = strview_init_t{ "libstdc++.so.6", + "libgotcha.so", + "libunwind-coredump.so", + "libunwind-generic.so", + "libunwind-ptrace.so", + "libunwind-setjmp.so", + "libunwind.so", + "libunwind-x86_64.so", + "librocm_smi64.so", + "libroctx64.so", + "librocmtools.so", + "libroctracer64.so", + "librocprofiler64.so", + "libpapi.so", + "libpfm.so", + "librocprofiler-register.so", + "librocprofiler-sdk.so", + "librocprofiler-sdk-roctx.so", + "libamd_smi.so" }; // shared libraries potentially used by timemory const auto _3rdparty_libs = strview_init_t{ "libcaliper.so", diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp index 93a2da5ae1..e8ce55473b 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp @@ -357,10 +357,12 @@ main(int argc, char** argv) itr.find("rocprof-sys") != std::string::npos || itr.find("rocprofiler-systems") != std::string::npos || std::regex_search( - itr, std::regex{ "lib(dyninstAPI|stackwalk|pcontrol|patchAPI|parseAPI|" - "instructionAPI|symtabAPI|dynDwarf|common|dynElf|tbb|" - "tbbmalloc|tbbmalloc_proxy|gotcha|libunwind|roctracer|" - "hsa-runtime|amdhip|rocm_smi)\\.(so|a)" })) + itr, std::regex{ + "lib(dyninstAPI|stackwalk|pcontrol|patchAPI|parseAPI|" + "instructionAPI|symtabAPI|dynDwarf|common|dynElf|tbb|tbbmalloc|" + "tbbmalloc_proxy|gotcha|libunwind|roctracer64|hsa-runtime|amdhip|" + "amd_comgr|rocm_smi64|rocprofiler64|rocprofiler-register|" + "rocprofiler-sdk|rocprofiler-sdk-roctx|amd_smi)\\.(so|a)" })) { if(!find(filepath::dirname(itr), lib_search_paths)) 
lib_search_paths.emplace_back(filepath::dirname(itr)); diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp index da9da3a5fa..c951455199 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp @@ -44,14 +44,6 @@ #include #include -#if !defined(ROCPROFSYS_USE_ROCTRACER) -# define ROCPROFSYS_USE_ROCTRACER 0 -#endif - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) -# define ROCPROFSYS_USE_ROCPROFILER 0 -#endif - namespace color = tim::log::color; using namespace timemory::join; using tim::get_env; @@ -140,17 +132,6 @@ get_initial_environment() update_env(_env, "ROCPROFSYS_USE_SAMPLING", (_mode != "causal")); -#if defined(ROCPROFSYS_USE_ROCTRACER) || defined(ROCPROFSYS_USE_ROCPROFILER) - update_env(_env, "HSA_TOOLS_LIB", _dl_libpath); - if(!getenv("HSA_TOOLS_REPORT_LOAD_FAILURE")) - update_env(_env, "HSA_TOOLS_REPORT_LOAD_FAILURE", "1"); -#endif - -#if defined(ROCPROFSYS_USE_ROCPROFILER) - update_env(_env, "ROCP_TOOL_LIB", _omni_libpath); - if(!getenv("ROCP_HSA_INTERCEPT")) update_env(_env, "ROCP_HSA_INTERCEPT", "1"); -#endif - #if defined(ROCPROFSYS_USE_OMPT) if(!getenv("OMP_TOOL_LIBRARIES")) update_env(_env, "OMP_TOOL_LIBRARIES", _dl_libpath, UPD_APPEND); @@ -357,14 +338,6 @@ parse_args(int argc, char** argv, std::vector& _env) %{INDENT}% 0 avoid triggering the bug, potentially at the cost of reduced performance %{INDENT}% 1 do not modify how ROCm is notified about kernel completion)"; - auto _realtime_reqs = (get_env("HSA_ENABLE_INTERRUPT", std::string{}, false).empty()) - ? 
std::vector{ "hsa-interrupt" } - : std::vector{}; - -#if ROCPROFSYS_USE_ROCTRACER == 0 && ROCPROFSYS_USE_ROCPROFILER == 0 - _realtime_reqs.clear(); -#endif - const auto* _trace_policy_desc = R"(Policy for new data when the buffer size limit is reached: %{INDENT}%- discard : new data is ignored @@ -720,7 +693,6 @@ parse_args(int argc, char** argv, std::vector& _env) parser.add_argument({ "--realtime" }, _realtime_desc) .min_count(0) - .required(std::move(_realtime_reqs)) .action([&](parser_t& p) { auto _v = p.get>("realtime"); update_env(_env, "ROCPROFSYS_SAMPLING_REALTIME", true); @@ -741,10 +713,20 @@ parse_args(int argc, char** argv, std::vector& _env) } }); - std::set _backend_choices = { "all", "kokkosp", "mpip", - "ompt", "rcclp", "rocm-smi", - "roctracer", "rocprofiler", "roctx", - "mutex-locks", "spin-locks", "rw-locks" }; + std::set _backend_choices = { "all", + "kokkosp", + "mpip", + "ompt", + "rcclp", + "rocm-smi", + "roctracer", + "rocprofiler", + "roctx", + "mutex-locks", + "spin-locks", + "rw-locks", + "rocprofiler-sdk", + "rocm" }; #if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS) _backend_choices.erase("mpip"); @@ -758,17 +740,10 @@ parse_args(int argc, char** argv, std::vector& _env) _backend_choices.erase("rcclp"); #endif -#if !defined(ROCPROFSYS_USE_ROCM_SMI) +#if !defined(ROCPROFSYS_USE_ROCM) + _backend_choices.erase("rocm"); _backend_choices.erase("rocm-smi"); -#endif - -#if !defined(ROCPROFSYS_USE_ROCTRACER) - _backend_choices.erase("roctracer"); - _backend_choices.erase("roctx"); -#endif - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) - _backend_choices.erase("rocprofiler"); + _backend_choices.erase("rocprofiler-sdk"); #endif parser.start_group("BACKEND OPTIONS", @@ -784,11 +759,9 @@ parse_args(int argc, char** argv, std::vector& _env) _update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0); _update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0); _update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0); + 
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0); _update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0); - _update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0); _update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0); - _update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0); - _update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0); _update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0); @@ -810,27 +783,18 @@ parse_args(int argc, char** argv, std::vector& _env) _update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0); _update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0); _update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0); + _update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0); _update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0); - _update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0); _update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0); - _update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0); - _update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0); _update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0); - if(_v.count("all") > 0 || - (_v.count("roctracer") > 0 && _v.count("rocprofiler") > 0)) - { - remove_env(_env, "HSA_TOOLS_LIB"); - remove_env(_env, "HSA_TOOLS_REPORT_LOAD_FAILURE"); - } - - if(_v.count("all") > 0 || _v.count("rocprofiler") > 0) - { - remove_env(_env, "ROCP_TOOL_LIB"); - remove_env(_env, "ROCP_HSA_INTERCEPT"); - } + // if(_v.count("all") > 0 || _v.count("rocprofiler") > 0) + // { + // remove_env(_env, "ROCP_TOOL_LIB"); + // remove_env(_env, "ROCP_HSA_INTERCEPT"); + // } if(_v.count("all") > 0 || _v.count("ompt") > 0) remove_env(_env, "OMP_TOOL_LIBRARIES"); @@ 
-850,18 +814,6 @@ parse_args(int argc, char** argv, std::vector& _env) update_env(_env, "ROCPROFSYS_PAPI_EVENTS", _events); }); -#if defined(ROCPROFSYS_USE_ROCPROFILER) - parser - .add_argument({ "-G", "--gpu-events" }, - "Set the GPU hardware counter events to record (ref: " - "`rocprof-sys-avail -H -c GPU`)") - .action([&](parser_t& p) { - auto _events = - join(array_config{ "," }, p.get>("gpu-events")); - update_env(_env, "ROCPROFSYS_ROCM_EVENTS", _events); - }); -#endif - parser.start_group("MISCELLANEOUS OPTIONS", ""); parser .add_argument({ "-i", "--inlines" }, diff --git a/projects/rocprofiler-systems/source/lib/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/CMakeLists.txt index 6d93bf7865..13f320d39a 100644 --- a/projects/rocprofiler-systems/source/lib/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/CMakeLists.txt @@ -12,15 +12,7 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20) cmake_policy(SET CMP0115 NEW) endif() -if(ROCPROFSYS_USE_ROCPROFILER - AND rocprofiler_LIBRARY_DIR - AND ROCmVersion_TRIPLE_VERSION VERSION_LESS 5.2.0 - AND NOT CMAKE_INSTALL_RPATH_USE_LINK_PATH) - set(ROCPROFSYS_LIB_INSTALL_RPATH - "\$ORIGIN:\$ORIGIN/${PROJECT_NAME}:${rocprofiler_LIBRARY_DIR}") -else() - set(ROCPROFSYS_LIB_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/${PROJECT_NAME}") -endif() +set(ROCPROFSYS_LIB_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/${PROJECT_NAME}") # ------------------------------------------------------------------------------# # @@ -50,10 +42,7 @@ target_link_libraries( $ $ $ - $ - $ - $ - $ + $ $ $ $ diff --git a/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt index b9c6f1919d..b4237c9d48 100644 --- a/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt @@ -19,7 +19,9 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/environment.hpp ${CMAKE_CURRENT_SOURCE_DIR}/invoke.hpp 
${CMAKE_CURRENT_SOURCE_DIR}/join.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/static_object.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/synchronized.hpp) get_filename_component(COMMON_SOURCE_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" DIRECTORY) get_filename_component(COMMON_BINARY_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}" DIRECTORY) diff --git a/projects/rocprofiler-systems/source/lib/common/defines.h.in b/projects/rocprofiler-systems/source/lib/common/defines.h.in index b8eff840ad..776a5f105a 100644 --- a/projects/rocprofiler-systems/source/lib/common/defines.h.in +++ b/projects/rocprofiler-systems/source/lib/common/defines.h.in @@ -42,10 +42,10 @@ #define ROCPROFSYS_COMPILER_STRING ROCPROFSYS_COMPILER_ID " v" ROCPROFSYS_COMPILER_VERSION #define ROCPROFSYS_DEFAULT_ROCM_PATH "@ROCmVersion_DIR@" -#define ROCPROFSYS_HIP_VERSION_STRING "@ROCPROFSYS_HIP_VERSION@" -#define ROCPROFSYS_HIP_VERSION_MAJOR @ROCPROFSYS_HIP_VERSION_MAJOR@ -#define ROCPROFSYS_HIP_VERSION_MINOR @ROCPROFSYS_HIP_VERSION_MINOR@ -#define ROCPROFSYS_HIP_VERSION_PATCH @ROCPROFSYS_HIP_VERSION_PATCH@ +#define ROCPROFSYS_ROCM_VERSION_STRING "@ROCPROFSYS_ROCM_VERSION@" +#define ROCPROFSYS_ROCM_VERSION_MAJOR @ROCPROFSYS_ROCM_VERSION_MAJOR@ +#define ROCPROFSYS_ROCM_VERSION_MINOR @ROCPROFSYS_ROCM_VERSION_MINOR@ +#define ROCPROFSYS_ROCM_VERSION_PATCH @ROCPROFSYS_ROCM_VERSION_PATCH@ // these can be set via defining the variable in CMake, e.g.: // cmake -D ROCPROFSYS_CACHELINE_SIZE=N /path/to/source @@ -63,15 +63,15 @@ ((10000 * ROCPROFSYS_VERSION_MAJOR) + (100 * ROCPROFSYS_VERSION_MINOR) + \ ROCPROFSYS_VERSION_PATCH) -#define ROCPROFSYS_HIP_VERSION \ - ((10000 * ROCPROFSYS_HIP_VERSION_MAJOR) + (100 * ROCPROFSYS_HIP_VERSION_MINOR) + \ - ROCPROFSYS_HIP_VERSION_PATCH) +#define ROCPROFSYS_ROCM_VERSION \ + ((10000 * ROCPROFSYS_ROCM_VERSION_MAJOR) + (100 * ROCPROFSYS_ROCM_VERSION_MINOR) + \ + ROCPROFSYS_ROCM_VERSION_PATCH) -#if ROCPROFSYS_HIP_VERSION_MAJOR > 
0 -# define ROCPROFSYS_HIP_VERSION_COMPAT_STRING \ - "v@ROCPROFSYS_HIP_VERSION_MAJOR@.@ROCPROFSYS_HIP_VERSION_MINOR@.x" +#if ROCPROFSYS_ROCM_VERSION_MAJOR > 0 +# define ROCPROFSYS_ROCM_VERSION_COMPAT_STRING \ + "v@ROCPROFSYS_ROCM_VERSION_MAJOR@.@ROCPROFSYS_ROCM_VERSION_MINOR@.x" #else -# define ROCPROFSYS_HIP_VERSION_COMPAT_STRING "" +# define ROCPROFSYS_ROCM_VERSION_COMPAT_STRING "" #endif // this should be passed to argparse::argument_parser::enable_version @@ -83,7 +83,7 @@ { \ { "", ROCPROFSYS_LIBRARY_ARCH }, { "compiler", ROCPROFSYS_COMPILER_STRING }, \ { \ - "rocm", ROCPROFSYS_HIP_VERSION_COMPAT_STRING \ + "rocm", ROCPROFSYS_ROCM_VERSION_COMPAT_STRING \ } \ } #endif diff --git a/projects/rocprofiler-systems/source/lib/common/setup.hpp b/projects/rocprofiler-systems/source/lib/common/setup.hpp index 0c2d1ad69a..1cf5e734c6 100644 --- a/projects/rocprofiler-systems/source/lib/common/setup.hpp +++ b/projects/rocprofiler-systems/source/lib/common/setup.hpp @@ -109,148 +109,6 @@ get_environ(int _verbose, std::string _search_paths = {}, _omnilib = common::path::find_path(_omnilib, _verbose, _search_paths); _omnilib_dl = common::path::find_path(_omnilib_dl, _verbose, _search_paths); -#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0 - _data.emplace_back(env_config{ "HSA_TOOLS_LIB", _omnilib.c_str(), 0 }); -#endif - -#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 -# if ROCPROFSYS_HIP_VERSION >= 50200 -# define ROCPROFILER_METRICS_DIR "lib/rocprofiler" -# else -# define ROCPROFILER_METRICS_DIR "rocprofiler/lib" -# endif -# if ROCPROFSYS_HIP_VERSION <= 50500 -# define ROCPROFILER_LIBNAME "librocprofiler64.so" -# else -# define ROCPROFILER_LIBNAME "librocprofiler64.so.1" -# endif - - _data.emplace_back(env_config{ "HSA_TOOLS_LIB", _omnilib.c_str(), 0 }); - _data.emplace_back(env_config{ "ROCP_TOOL_LIB", _omnilib.c_str(), 0 }); - _data.emplace_back(env_config{ "ROCPROFILER_LOG", "1", 0 }); - _data.emplace_back(env_config{ 
"ROCP_HSA_INTERCEPT", "1", 0 }); - _data.emplace_back(env_config{ "HSA_TOOLS_REPORT_LOAD_FAILURE", "1", 0 }); - - auto _possible_rocp_metrics = std::vector{}; - auto _possible_rocprof_libs = std::vector{}; - for(const auto* itr : { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" }) - { - if(getenv(itr)) - { - _possible_rocp_metrics.emplace_back( - common::join('/', getenv(itr), "lib/rocprofiler")); - _possible_rocprof_libs.emplace_back( - common::join('/', getenv(itr), "lib/rocprofiler", ROCPROFILER_LIBNAME)); - _possible_rocp_metrics.emplace_back( - common::join('/', getenv(itr), "rocprofiler/lib")); - _possible_rocprof_libs.emplace_back( - common::join('/', getenv(itr), "rocprofiler/lib", ROCPROFILER_LIBNAME)); - } - } - - // default path - _possible_rocp_metrics.emplace_back( - common::join('/', ROCPROFSYS_DEFAULT_ROCM_PATH, "lib/rocprofiler")); - _possible_rocp_metrics.emplace_back( - common::join('/', ROCPROFSYS_DEFAULT_ROCM_PATH, "rocprofiler/lib")); - - auto _realpath_and_unique = [](const auto& _inp_v) { - auto _out_v = decltype(_inp_v){}; - for(auto& itr : _inp_v) - { - if(path::exists(itr)) _out_v.emplace_back(path::realpath(itr)); - } - - _out_v.erase(std::unique(_out_v.begin(), _out_v.end()), _out_v.end()); - return _out_v; - }; - - _possible_rocprof_libs = _realpath_and_unique(_possible_rocprof_libs); - - for(const auto& itr : _possible_rocprof_libs) - { - if(path::exists(itr)) - { - _data.emplace_back( - env_config{ "ROCPROFSYS_ROCPROFILER_LIBRARY", itr.c_str(), 0 }); - _possible_rocp_metrics.emplace( - _possible_rocp_metrics.begin(), - common::join('/', path::dirname(itr), "../../lib/rocprofiler")); - _possible_rocp_metrics.emplace(_possible_rocp_metrics.begin(), - common::join('/', path::dirname(itr))); - } - } - - _possible_rocp_metrics = _realpath_and_unique(_possible_rocp_metrics); - - auto _env_rocp_metrics = get_env("ROCP_METRICS", ""); - if(!_env_rocp_metrics.empty()) - { - if(!path::exists(_env_rocp_metrics)) - throw std::runtime_error(join("", "Error! 
ROCP_METRICS file \"", - _env_rocp_metrics, "\" does not exist")); - _possible_rocp_metrics.clear(); - _possible_rocp_metrics.emplace_back( - common::join('/', path::dirname(_env_rocp_metrics))); - } - - auto _found_rocp_metrics = (!_env_rocp_metrics.empty()) - ? get_env("ROCPROFSYS_ROCP_METRICS_FORCE_VALID", false) - : false; - - if(!_found_rocp_metrics) - { - for(const auto& itr : _possible_rocp_metrics) - { - auto _metrics_path = join('/', itr, "metrics.xml"); - if(path::exists(itr) && path::exists(_metrics_path) && - path::exists(join('/', itr, "gfx_metrics.xml"))) - { - _found_rocp_metrics = true; - _data.emplace_back( - env_config{ "ROCP_METRICS", _metrics_path.c_str(), 0 }); - break; - } - } - } - - // handle error - if(!_found_rocp_metrics) - { - auto _msg = std::stringstream{}; - _msg << std::boolalpha; - if(!_env_rocp_metrics.empty()) - { - auto _env_rocp_metrics_dir = path::dirname(_env_rocp_metrics); - auto _rocp_metrics_xml = join('/', _env_rocp_metrics_dir, "metrics.xml"); - auto _rocp_gfx_metrics_xml = - join('/', _env_rocp_metrics_dir, "gfx_metrics.xml"); - _msg << "Error! ROCP_METRICS=\"" << _env_rocp_metrics - << "\" in the environment but the directory (" << _env_rocp_metrics_dir - << ") does not contain " - "metrics.xml (found: " - << path::exists(_rocp_metrics_xml) << ") and/or gfx_metrics.xml (found: " - << path::exists(_rocp_gfx_metrics_xml) - << "). To ignore this error, set " - "ROCPROFSYS_ROCP_METRICS_FORCE_VALID=true in the environment"; - } - else - { - _msg - << "Error! ROCP_METRICS not set in environment and rocprof-sys could not " - "find a suitable path. Please set ROCP_METRICS=/path/to/metrics.xml " - "in the environment. 
This file is typically located in the same " - "folder as the librocprofiler64.so library.\nAdditional note: " - "metrics.xml typically contains:\n\t#include " - "\"gfx_metrics.xml\"\nMake sure the provided path also contains this " - "file.\nExample:\n\texport ROCP_METRICS=" - << ROCPROFSYS_DEFAULT_ROCM_PATH << "/" << ROCPROFILER_METRICS_DIR - << "/metrics.xml\n"; - } - throw std::runtime_error(_msg.str()); - } -#endif - #if defined(ROCPROFSYS_USE_OMPT) && ROCPROFSYS_USE_OMPT > 0 if(get_env("ROCPROFSYS_USE_OMPT", true)) { diff --git a/projects/rocprofiler-systems/source/lib/common/static_object.hpp b/projects/rocprofiler-systems/source/lib/common/static_object.hpp new file mode 100644 index 0000000000..567ce2c7f6 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/common/static_object.hpp @@ -0,0 +1,207 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofsys +{ +inline namespace common +{ +using static_dtor_func_t = void (*)(); + +void +destroy_static_objects(); + +void +register_static_dtor(static_dtor_func_t&&); + +namespace +{ +struct anonymous +{}; +} // namespace + +struct do_not_destroy +{}; + +template +constexpr size_t +static_buffer_size() +{ + return sizeof(Tp); +} + +/** + * @brief This struct is used to create static singleton objects which have the properties + * of a heap-allocated static object without a memory leak. + * + * @tparam Tp Data type of singleton + * @tparam ContextT Used to differentiate singletons in different translation units (if + * using default parameter) or ensure the singleton can be accessed in different + * translation units (not recommended) as long as this type is not in an anonymous + * namespace + * + * This template works by creating a buffer of at least `sizeof(Tp)` bytes in the binary + * and doing a placement new into that buffer. The object created is NOT heap allocated; + * the address of the object is an address in between the library load address and the + * load address + size of library. + */ +template +struct static_object +{ + static_object() = delete; + ~static_object() = delete; + static_object(const static_object&) = delete; + static_object(static_object&&) noexcept = delete; + static_object& operator=(const static_object&) = delete; + static_object& operator=(static_object&&) noexcept = delete; + + template + static Tp*& construct(Args&&... args); + + template + static Tp*& construct(do_not_destroy&&, Args&&...
args); + + static Tp* get() { return m_object; } + + static constexpr bool is_trivial_standard_layout(); + +private: + static Tp* m_object; + static std::array()> m_buffer; +}; + +template +Tp* static_object::m_object = nullptr; + +template +std::array()> + static_object::m_buffer = {}; + +template +constexpr bool +static_object::is_trivial_standard_layout() +{ + return (std::is_standard_layout::value && std::is_trivial::value); +} + +template +template +Tp*& +static_object::construct(Args&&... args) +{ + if constexpr(!is_trivial_standard_layout()) + { + static auto _once = std::once_flag{}; + std::call_once(_once, []() { + register_static_dtor([]() { + if(static_object::m_object) + { + static_object::m_object->~Tp(); + static_object::m_object = nullptr; + } + }); + }); + } + + if(m_object) + { + std::cerr + << "reconstructing static object. Use get() function to retrieve pointer" + << std::endl; + abort(); + } + + m_object = new(m_buffer.data()) Tp{ std::forward(args)... }; + return m_object; +} + +template +template +Tp*& +static_object::construct(do_not_destroy&&, Args&&... args) +{ + if(m_object) + { + std::cerr + << "reconstructing static object. Use get() function to retrieve pointer" + << std::endl; + abort(); + } + + m_object = new(m_buffer.data()) Tp{ std::forward(args)... 
}; + return m_object; +} + +namespace +{ +inline auto*& +get_static_object_stack() +{ + static auto* _v = new std::stack{}; + return _v; +} +} // namespace + +inline void +destroy_static_objects() +{ + static auto _sync = std::mutex{}; + auto _lk = std::unique_lock{ _sync }; + + auto*& _stack = get_static_object_stack(); + if(_stack) + { + while(!_stack->empty()) + { + auto& itr = _stack->top(); + if(itr) itr(); + _stack->pop(); + } + + delete _stack; + _stack = nullptr; + } +} + +inline void +register_static_dtor(static_dtor_func_t&& _func) +{ + static auto _sync = std::mutex{}; + auto _lk = std::unique_lock{ _sync }; + + auto*& _stack = get_static_object_stack(); + if(_stack) + { + _stack->push(_func); + } +} +} // namespace common +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/common/synchronized.hpp b/projects/rocprofiler-systems/source/lib/common/synchronized.hpp new file mode 100644 index 0000000000..99b4aa1e5b --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/common/synchronized.hpp @@ -0,0 +1,167 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include + +namespace rocprofsys +{ +inline namespace common +{ +/** + * Sychronized is a wrapper that adds lock based write/read + * protection around a datatype. The protected data is accessed + * only by rlock/wlock. rlock(lambda) gets a reader lock of the + * protected value, passing the protected value to the lambda as a + * const. wlock(lambda) gets a writer lock on the protective value + * and does the same. The reason for this class is to make it less + * error prone to access shared data and more obvious when a lock + * is being held. + * + * Example usage: + * + * synchronized x(9); + * x.rlock([](const auto& data){ + * // data = 9 + * }); + * + * x.wlock([](auto& data){ + * // set data to new value + * }); + */ +template +class synchronized +{ +public: + using value_type = LockedType; + using this_type = synchronized; + + synchronized() = default; + ~synchronized() = default; + + explicit synchronized(value_type&& data) + : m_data{ std::move(data) } + {} + + synchronized(synchronized&& data) noexcept = default; + synchronized& operator=(synchronized&& data) noexcept = default; + + // Do not allow this data structure to be copied, std::move only. + synchronized(const synchronized&) = delete; + + template + decltype(auto) rlock(FuncT&& lambda, Args&&... args) const; + + template + decltype(auto) wlock(FuncT&& lambda, Args&&... args); + + // This overload to wlock allows a synchronized map whose keys map to synchronized + // data to use a read lock on the key data and then a write lock on the mapped data. + template = 0> + decltype(auto) wlock(FuncT&& lambda, Args&&... args) const; + + // Upgradable lock. 
If read returns false, write will be called with a unique_lock. + // Essentially a helper function that does .rlock() followed by .wlock(). + template + bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args); + +private: + mutable std::shared_mutex m_mutex = {}; + value_type m_data = {}; +}; + +// +// member definitions +// +template +template +decltype(auto) +synchronized::rlock(FuncT&& lambda, Args&&... args) const +{ + static_assert(std::is_invocable::value, + "function must accept const reference to locked type"); + + auto lock = std::shared_lock{ m_mutex }; + return std::forward(lambda)(m_data, std::forward(args)...); +} + +template +template +decltype(auto) +synchronized::wlock(FuncT&& lambda, Args&&... args) +{ + static_assert(std::is_invocable::value, + "function must accept reference to locked type"); + + auto lock = std::unique_lock{ m_mutex }; + return std::forward(lambda)(m_data, std::forward(args)...); +} + +// This overload to wlock allows a synchronized map whose keys map to synchronized data to +// use a read lock on the key data and then a write lock on the mapped data. +template +template > +decltype(auto) +synchronized::wlock(FuncT&& lambda, Args&&... args) const +{ + return const_cast(this)->wlock(std::forward(lambda), + std::forward(args)...); +} + +// Upgradable lock. If read returns false, write will be called with a unique_lock. +// Essentially a helper function that does .rlock() followed by .wlock(). +template +template +bool +synchronized::ulock(ReadFuncT&& read, WriteFuncT&& write, + Args&&... 
args) +{ + static_assert(std::is_invocable::value, + "read function must accept const reference to locked type"); + static_assert(std::is_invocable::value, + "write function must accept reference to locked type"); + + using read_return_type = std::invoke_result_t; + using write_return_type = std::invoke_result_t; + + static_assert(std::is_same::value, + "read and write functions must return same type"); + static_assert(std::is_same::value, + "read/write functions must return bool"); + + { + auto lock = std::shared_lock{ m_mutex }; + if(read(m_data, std::forward(args)...)) return true; + } + + auto lock = std::unique_lock{ m_mutex }; + return write(m_data, std::forward(args)...); +} +} // namespace common +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt index 6e3b534163..184229642c 100644 --- a/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt @@ -14,6 +14,7 @@ set(core_sources ${CMAKE_CURRENT_LIST_DIR}/mproc.cpp ${CMAKE_CURRENT_LIST_DIR}/perf.cpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.cpp + ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp ${CMAKE_CURRENT_LIST_DIR}/state.cpp ${CMAKE_CURRENT_LIST_DIR}/timemory.cpp ${CMAKE_CURRENT_LIST_DIR}/utility.cpp) @@ -29,13 +30,13 @@ set(core_headers ${CMAKE_CURRENT_LIST_DIR}/dynamic_library.hpp ${CMAKE_CURRENT_LIST_DIR}/exception.hpp ${CMAKE_CURRENT_LIST_DIR}/gpu.hpp - ${CMAKE_CURRENT_LIST_DIR}/hip_runtime.hpp ${CMAKE_CURRENT_LIST_DIR}/locking.hpp ${CMAKE_CURRENT_LIST_DIR}/mproc.hpp ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp ${CMAKE_CURRENT_LIST_DIR}/rccl.hpp ${CMAKE_CURRENT_LIST_DIR}/redirect.hpp + ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp ${CMAKE_CURRENT_LIST_DIR}/state.hpp ${CMAKE_CURRENT_LIST_DIR}/timemory.hpp ${CMAKE_CURRENT_LIST_DIR}/utility.hpp) @@ -54,6 +55,10 @@ add_subdirectory(containers) 
target_include_directories(rocprofiler-systems-core-library BEFORE PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_include_directories( + rocprofiler-systems-core-library + PRIVATE ${PROJECT_SOURCE_DIR}/external/timemory/source/timemory/tpls/cereal) + target_link_libraries(rocprofiler-systems-core-library PRIVATE rocprofiler-systems::rocprofiler-systems-interface-library) target_link_libraries( @@ -67,8 +72,7 @@ target_link_libraries( $ $ $ - $ - $ + $ $ $ $ diff --git a/projects/rocprofiler-systems/source/lib/core/argparse.cpp b/projects/rocprofiler-systems/source/lib/core/argparse.cpp index ee195cb438..01c0d229f0 100644 --- a/projects/rocprofiler-systems/source/lib/core/argparse.cpp +++ b/projects/rocprofiler-systems/source/lib/core/argparse.cpp @@ -222,17 +222,6 @@ init_parser(parser_data& _data) _data.dl_libpath = get_realpath(get_internal_libpath("librocprof-sys-dl.so").c_str()); _data.omni_libpath = get_realpath(get_internal_libpath("librocprof-sys.so").c_str()); -#if defined(ROCPROFSYS_USE_ROCTRACER) || defined(ROCPROFSYS_USE_ROCPROFILER) - update_env(_data, "HSA_TOOLS_LIB", _data.dl_libpath); - if(!getenv("HSA_TOOLS_REPORT_LOAD_FAILURE")) - update_env(_data, "HSA_TOOLS_REPORT_LOAD_FAILURE", "1"); -#endif - -#if defined(ROCPROFSYS_USE_ROCPROFILER) - update_env(_data, "ROCP_TOOL_LIB", _data.omni_libpath); - if(!getenv("ROCP_HSA_INTERCEPT")) update_env(_data, "ROCP_HSA_INTERCEPT", "1"); -#endif - #if defined(ROCPROFSYS_USE_OMPT) if(!getenv("OMP_TOOL_LIBRARIES")) update_env(_data, "OMP_TOOL_LIBRARIES", _data.dl_libpath, UPD_PREPEND); @@ -300,15 +289,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data) %{INDENT}% 0 avoid triggering the bug, potentially at the cost of reduced performance %{INDENT}% 1 do not modify how ROCm is notified about kernel completion)"; - auto _realtime_reqs = - (tim::get_env("HSA_ENABLE_INTERRUPT", std::string{}, false).empty()) - ? 
strvec_t{ "hsa-interrupt" } - : strvec_t{}; - -#if ROCPROFSYS_USE_ROCTRACER == 0 && ROCPROFSYS_USE_ROCPROFILER == 0 - _realtime_reqs.clear(); -#endif - const auto* _trace_policy_desc = R"(Policy for new data when the buffer size limit is reached: %{INDENT}%- discard : new data is ignored @@ -579,45 +559,29 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _backend_choices.erase("rcclp"); #endif -#if !defined(ROCPROFSYS_USE_ROCM_SMI) +#if !defined(ROCPROFSYS_USE_ROCM) + _backend_choices.erase("amd-smi"); _backend_choices.erase("rocm-smi"); -#endif - -#if !defined(ROCPROFSYS_USE_ROCTRACER) - _backend_choices.erase("roctracer"); - _backend_choices.erase("roctx"); -#endif - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) - _backend_choices.erase("rocprofiler"); + _backend_choices.erase("rocprofiler-sdk"); + _backend_choices.erase("rocm"); #endif if(gpu::device_count() == 0) { + // remove GPU-specific backends _backend_choices.erase("rcclp"); + _backend_choices.erase("amd-smi"); _backend_choices.erase("rocm-smi"); - _backend_choices.erase("roctracer"); - _backend_choices.erase("rocprofiler"); + _backend_choices.erase("rocprofiler-sdk"); + _backend_choices.erase("rocm"); #if defined(ROCPROFSYS_USE_RCCL) update_env(_data, "ROCPROFSYS_USE_RCCLP", false); #endif -#if defined(ROCPROFSYS_USE_ROCM_SMI) +#if defined(ROCPROFSYS_USE_ROCM) update_env(_data, "ROCPROFSYS_USE_ROCM_SMI", false); -#endif - -#if defined(ROCPROFSYS_USE_ROCTRACER) - update_env(_data, "ROCPROFSYS_USE_ROCTRACER", false); - update_env(_data, "ROCPROFSYS_USE_ROCTX", false); - update_env(_data, "ROCPROFSYS_ROCTRACER_HSA_ACTIVITY", false); - update_env(_data, "ROCPROFSYS_ROCTRACER_HIP_ACTIVITY", false); - _backend_choices.erase("roctracer"); - _backend_choices.erase("roctx"); -#endif - -#if defined(ROCPROFSYS_USE_ROCPROFILER) - update_env(_data, "ROCPROFSYS_USE_ROCPROFILER", false); + update_env(_data, "ROCPROFSYS_USE_ROCM", false); #endif } @@ -640,11 +604,9 @@ add_core_arguments(parser_t& _parser, 
parser_data& _data) _update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0); _update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0); _update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0); + _update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0); _update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0); - _update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0); _update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0); - _update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0); - _update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0); _update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0); @@ -676,28 +638,13 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0); _update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0); _update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0); + _update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0); _update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0); - _update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0); _update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0); - _update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0); - _update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0); _update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0); _update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0); - if(_v.count("all") > 0 || - (_v.count("roctracer") > 0 && _v.count("rocprofiler") > 0)) - { - remove_env(_data, "HSA_TOOLS_LIB"); - remove_env(_data, "HSA_TOOLS_REPORT_LOAD_FAILURE"); - } - - if(_v.count("all") > 0 || _v.count("rocprofiler") > 0) - { - remove_env(_data, "ROCP_TOOL_LIB"); - remove_env(_data, "ROCP_HSA_INTERCEPT"); - } - if(_v.count("all") > 0 || _v.count("ompt") > 0) 
remove_env(_data, "OMP_TOOL_LIBRARIES"); @@ -1126,7 +1073,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _parser.add_argument({ "--sample-realtime" }, _realtime_desc) .min_count(0) .dtype("[freq] [delay] [tids...]") - .required(std::move(_realtime_reqs)) .action([&](parser_t& p) { auto _v = p.get>("sample-realtime"); update_env(_data, "ROCPROFSYS_SAMPLING_REALTIME", true); @@ -1210,25 +1156,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _data.processed_environs.emplace("papi_events"); } -#if defined(ROCPROFSYS_USE_ROCPROFILER) - if(_data.environ_filter("gpu_events", _data)) - { - _parser - .add_argument({ "-G", "--gpu-events" }, - "Set the GPU hardware counter events to record (ref: " - "`rocprof-sys-avail -H -c GPU`)") - .min_count(1) - .dtype("[EVENT ...]") - .action([&](parser_t& p) { - auto _events = join(array_config_t{ "," }, p.get("gpu-events")); - update_env(_data, "ROCPROFSYS_ROCM_EVENTS", _events); - }); - - _data.processed_environs.emplace("gpu_events"); - _data.processed_environs.emplace("rocm_events"); - } -#endif - add_group_arguments(_parser, "category", _data, true); add_group_arguments(_parser, "io", _data, true); add_group_arguments(_parser, "perfetto", _data, true); diff --git a/projects/rocprofiler-systems/source/lib/core/categories.hpp b/projects/rocprofiler-systems/source/lib/core/categories.hpp index 5eb633db23..0f09f4f1b4 100644 --- a/projects/rocprofiler-systems/source/lib/core/categories.hpp +++ b/projects/rocprofiler-systems/source/lib/core/categories.hpp @@ -91,19 +91,21 @@ ROCPROFSYS_DEFINE_CATEGORY(project, rocprofsys, ROCPROFSYS_CATEGORY_NONE, "rocpr ROCPROFSYS_DEFINE_CATEGORY(category, host, ROCPROFSYS_CATEGORY_HOST, "host", "Host-side function tracing") ROCPROFSYS_DEFINE_CATEGORY(category, user, ROCPROFSYS_CATEGORY_USER, "user", "User-defined regions") ROCPROFSYS_DEFINE_CATEGORY(category, python, ROCPROFSYS_CATEGORY_PYTHON, "python", "Python regions") -ROCPROFSYS_DEFINE_CATEGORY(category, device_hip, 
ROCPROFSYS_CATEGORY_DEVICE_HIP, "device_hip", "Device-side functions submitted via HIP API") -ROCPROFSYS_DEFINE_CATEGORY(category, device_hsa, ROCPROFSYS_CATEGORY_DEVICE_HSA, "device_hsa", "Device-side functions submitted via HSA API") -ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip, ROCPROFSYS_CATEGORY_ROCM_HIP, "rocm_hip", "Host-side HIP functions") -ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hsa, ROCPROFSYS_CATEGORY_ROCM_HSA, "rocm_hsa", "Host-side HSA functions") -ROCPROFSYS_DEFINE_CATEGORY(category, rocm_roctx, ROCPROFSYS_CATEGORY_ROCM_ROCTX, "rocm_roctx", "ROCTx labels") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm, ROCPROFSYS_CATEGORY_ROCM, "rocm", "General ROCm tracing") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip_api, ROCPROFSYS_CATEGORY_ROCM_HIP_API, "rocm_hip_api", "ROCm HIP functions") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hsa_api, ROCPROFSYS_CATEGORY_ROCM_HSA_API, "rocm_hsa_api", "ROCm HSA functions") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_kernel_dispatch, ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH, "rocm_kernel_dispatch", "ROCm Kernel dispatch") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_memory_copy, ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY, "rocm_memory_copy", "ROCm Async Memory Copy") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_scratch_memory, ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY, "rocm_scratch_memory", "ROCm kernel scratch memory reallocations") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_page_migration, ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION, "rocm_page_migration", "ROCm memory page migration") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION, "rocm_counter_collection", "ROCm device counter collection") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi, ROCPROFSYS_CATEGORY_ROCM_SMI, "rocm_smi", "rocm-smi data") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_busy, 
ROCPROFSYS_CATEGORY_ROCM_SMI_BUSY, "device_busy", "Busy percentage of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, "device_temp", "Temperature of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions") -ROCPROFSYS_DEFINE_CATEGORY(category, roctracer, ROCPROFSYS_CATEGORY_ROCTRACER, "roctracer", "Kernel tracing provided by roctracer") -ROCPROFSYS_DEFINE_CATEGORY(category, rocprofiler, ROCPROFSYS_CATEGORY_ROCPROFILER, "rocprofiler", "HW counter data provided by rocprofiler") ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions") ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions") ROCPROFSYS_DEFINE_CATEGORY(category, mpi, ROCPROFSYS_CATEGORY_MPI, "mpi", "MPI regions") @@ -151,19 +153,21 @@ using name = perfetto_category; ROCPROFSYS_PERFETTO_CATEGORY(category::user), \ ROCPROFSYS_PERFETTO_CATEGORY(category::python), \ ROCPROFSYS_PERFETTO_CATEGORY(category::sampling), \ - ROCPROFSYS_PERFETTO_CATEGORY(category::device_hip), \ - ROCPROFSYS_PERFETTO_CATEGORY(category::device_hsa), \ - ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hip), \ - ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hsa), \ - ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_roctx), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hip_api), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hsa_api), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_kernel_dispatch), \ + 
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_memory_copy), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_scratch_memory), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_page_migration), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_counter_collection), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_marker_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_busy), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_temp), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \ - ROCPROFSYS_PERFETTO_CATEGORY(category::roctracer), \ - ROCPROFSYS_PERFETTO_CATEGORY(category::rocprofiler), \ ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \ ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \ ROCPROFSYS_PERFETTO_CATEGORY(category::mpi), \ diff --git a/projects/rocprofiler-systems/source/lib/core/components/fwd.hpp b/projects/rocprofiler-systems/source/lib/core/components/fwd.hpp index b5726086cf..8e9343d9d2 100644 --- a/projects/rocprofiler-systems/source/lib/core/components/fwd.hpp +++ b/projects/rocprofiler-systems/source/lib/core/components/fwd.hpp @@ -96,14 +96,6 @@ struct functors; } // namespace component } // namespace rocprofsys -#if !defined(ROCPROFSYS_USE_ROCTRACER) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type) -#endif - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type) -#endif - #if !defined(ROCPROFSYS_USE_RCCL) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, category::rocm_rccl, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type) @@ -124,7 +116,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_cpu_clock, fa ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_percent, false_type) #endif -#if 
!defined(TIMEMORY_USE_LIBUNWIND) || !defined(ROCPROFSYS_USE_ROCM_SMI) +#if !defined(TIMEMORY_USE_LIBUNWIND) || !defined(ROCPROFSYS_USE_ROCM) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type) diff --git a/projects/rocprofiler-systems/source/lib/core/config.cpp b/projects/rocprofiler-systems/source/lib/core/config.cpp index 38fbc4e5ed..b5f249d4ca 100644 --- a/projects/rocprofiler-systems/source/lib/core/config.cpp +++ b/projects/rocprofiler-systems/source/lib/core/config.cpp @@ -22,6 +22,7 @@ #include "config.hpp" #include "common/defines.h" +#include "common/static_object.hpp" #include "constraint.hpp" #include "debug.hpp" #include "defines.hpp" @@ -29,9 +30,9 @@ #include "mproc.hpp" #include "perf.hpp" #include "perfetto.hpp" +#include "rocprofiler-sdk.hpp" #include "utility.hpp" -#include #include #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #include #include @@ -60,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +70,7 @@ #include #include #include +#include #include #include @@ -76,6 +80,11 @@ using settings = tim::settings; namespace { +int verbose_value = tim::get_env("ROCPROFSYS_VERBOSE", 0, false); +bool debug_value = tim::get_env("ROCPROFSYS_DEBUG", false, false); +bool is_ci_value = tim::get_env("ROCPROFSYS_CI", false, false); +auto configure_once = std::once_flag{}; + TIMEMORY_NOINLINE bool& _settings_are_configured() { @@ -83,6 +92,14 @@ _settings_are_configured() return _v; } +auto*& +get_config_impl() +{ + static auto*& _v = common::static_object>::construct( + common::do_not_destroy{}, settings::shared_instance()); + return _v; +} + auto get_config() { @@ -97,7 +114,7 @@ get_config() std::string get_setting_name(std::string _v) { - static const auto _prefix = 
tim::string_view_t{ "rocprofsys_" }; + constexpr auto _prefix = tim::string_view_t{ "rocprofsys_" }; for(auto& itr : _v) itr = tolower(itr); auto _pos = _v.find(_prefix); @@ -195,7 +212,7 @@ configure_settings(bool _init) if(settings_are_configured()) return; - if(get_is_continuous_integration() && get_state() < State::Init) + if(is_ci_value && get_state() < State::Init) { timemory_print_demangled_backtrace<64>(); ROCPROFSYS_THROW("config::configure_settings() called before " @@ -220,17 +237,17 @@ configure_settings(bool _init) tim::manager::add_metadata("ROCPROFSYS_COMPILER_VERSION", ROCPROFSYS_COMPILER_VERSION); -#if ROCPROFSYS_HIP_VERSION > 0 - tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION", ROCPROFSYS_HIP_VERSION_STRING); - tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_MAJOR", - ROCPROFSYS_HIP_VERSION_MAJOR); - tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_MINOR", - ROCPROFSYS_HIP_VERSION_MINOR); - tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_PATCH", - ROCPROFSYS_HIP_VERSION_PATCH); +#if ROCPROFSYS_ROCM_VERSION > 0 + tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION", ROCPROFSYS_ROCM_VERSION_STRING); + tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_MAJOR", + ROCPROFSYS_ROCM_VERSION_MAJOR); + tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_MINOR", + ROCPROFSYS_ROCM_VERSION_MINOR); + tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_PATCH", + ROCPROFSYS_ROCM_VERSION_PATCH); #endif - auto _config = settings::shared_instance(); + auto _config = *get_config_impl(); // if using timemory, default to perfetto being off auto _default_perfetto_v = !tim::get_env("ROCPROFSYS_PROFILE", false, false); @@ -294,24 +311,15 @@ configure_settings(bool _init) "Enable causal profiling analysis", false, "backend", "causal", "analysis"); - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCTRACER", + ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCM", "Enable ROCm API and kernel tracing", true, "backend", - "roctracer", "rocm"); - - 
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCPROFILER", - "Enable ROCm hardware counters", true, "backend", - "rocprofiler", "rocm"); + "rocm"); ROCPROFSYS_CONFIG_SETTING( bool, "ROCPROFSYS_USE_ROCM_SMI", "Enable sampling GPU power, temp, utilization, and memory usage", true, "backend", "rocm_smi", "rocm", "process_sampling"); - ROCPROFSYS_CONFIG_SETTING( - bool, "ROCPROFSYS_USE_ROCTX", - "Enable ROCtx API. Warning! Out-of-order ranges may corrupt perfetto flamegraph", - false, "backend", "roctracer", "rocm", "roctx"); - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_SAMPLING", "Enable statistical sampling of call-stack", false, "backend", "sampling"); @@ -616,41 +624,7 @@ configure_settings(bool _init) "sampling", "hardware_counters") ->set_choices(perf::get_config_choices()); - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HIP_API", - "Enable HIP API tracing support", true, "roctracer", "rocm", - "advanced"); - - ROCPROFSYS_CONFIG_SETTING( - bool, "ROCPROFSYS_ROCTRACER_HIP_API_BACKTRACE", - "Enable annotating the perfetto debug annotation with backtraces", false, - "roctracer", "rocm", "perfetto", "advanced"); - - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HIP_ACTIVITY", - "Enable HIP activity tracing support", true, "roctracer", - "rocm", "advanced"); - - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HSA_ACTIVITY", - "Enable HSA activity tracing support", false, "roctracer", - "rocm", "advanced"); - - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HSA_API", - "Enable HSA API tracing support", false, "roctracer", - "rocm", "advanced"); - - ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCTRACER_HSA_API_TYPES", - "HSA API type to collect", "", "roctracer", "rocm", - "advanced"); - - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_DISCARD_BARRIERS", - "Skip barrier marker events in traces", false, "roctracer", - "rocm", "advanced"); - - ROCPROFSYS_CONFIG_SETTING( - std::string, "ROCPROFSYS_ROCM_EVENTS", - "ROCm 
hardware counters. Use ':device=N' syntax to specify collection on device " - "number N, e.g. ':device=0'. If no device specification is provided, the event " - "is collected on every available device", - "", "rocprofiler", "rocm", "hardware_counters"); + rocprofiler_sdk::config_settings(_config); ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS", "rocm-smi metrics to collect: busy, temp, power, mem_usage", @@ -670,12 +644,6 @@ configure_settings(bool _init) "default to the value of ROCPROFSYS_COLLAPSE_PROCESSES", false, "perfetto", "data", "advanced"); - ROCPROFSYS_CONFIG_SETTING( - bool, "ROCPROFSYS_PERFETTO_ROCTRACER_PER_STREAM", - "Separate roctracer GPU side traces (copies, kernels) into separate " - "tracks based on the stream they're enqueued into", - true, "perfetto", "roctracer", "rocm", "advanced"); - ROCPROFSYS_CONFIG_SETTING( std::string, "ROCPROFSYS_PERFETTO_FILL_POLICY", "Behavior when perfetto buffer is full. 'discard' will ignore new entries, " @@ -704,18 +672,6 @@ configure_settings(bool _init) "feature may dramatically reduce the size of the trace", true, "perfetto", "data", "debugging", "advanced"); - ROCPROFSYS_CONFIG_SETTING( - bool, "ROCPROFSYS_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS", - "When PERFETTO_ANNOTATIONS, USE_ROCTRACER, and ROCTRACER_HIP_API are all " - "enabled, enabling this option will result in the arg information for HIP API " - "calls to all be within one annotation (e.g., args=\"stream=0x0, dst=0x1F, " - "sizeBytes=64, src=0x08, kind=1\"). When disabled, each parameter will be an " - "individual annotation (e.g. stream, dst, sizeBytes, etc.). 
The benefit of the " - "former is that it is faster to serialize and consumes less file space; the " - "benefit of the latter is that it becomes much easier to find slices in the " - "trace with the same value", - false, "perfetto", "data", "debugging", "roctracer", "rocm", "advanced"); - ROCPROFSYS_CONFIG_SETTING( uint64_t, "ROCPROFSYS_THREAD_POOL_SIZE", "Max number of threads for processing background tasks", @@ -1045,6 +1001,10 @@ configure_settings(bool _init) settings::suppress_config() = true; + if(auto opt = get_setting_value("ROCPROFSYS_VERBOSE"); opt) verbose_value = *opt; + if(auto opt = get_setting_value("ROCPROFSYS_DEBUG"); opt) debug_value = *opt; + if(auto opt = get_setting_value("ROCPROFSYS_CI"); opt) is_ci_value = *opt; + if(get_env("ROCPROFSYS_MONOCHROME", _config->get("ROCPROFSYS_MONOCHROME"))) tim::log::monochrome() = true; @@ -1106,6 +1066,10 @@ configure_settings(bool _init) ROCPROFSYS_BASIC_VERBOSE(2, "configuration complete\n"); + if(auto opt = get_setting_value("ROCPROFSYS_VERBOSE"); opt) verbose_value = *opt; + if(auto opt = get_setting_value("ROCPROFSYS_DEBUG"); opt) debug_value = *opt; + if(auto opt = get_setting_value("ROCPROFSYS_CI"); opt) is_ci_value = *opt; + _settings_are_configured() = true; } @@ -1140,8 +1104,6 @@ configure_mode_settings(const std::shared_ptr& _config) _set("ROCPROFSYS_PROFILE", false); _set("ROCPROFSYS_USE_CAUSAL", false); _set("ROCPROFSYS_USE_ROCM_SMI", false); - _set("ROCPROFSYS_USE_ROCTRACER", false); - _set("ROCPROFSYS_USE_ROCPROFILER", false); _set("ROCPROFSYS_USE_KOKKOSP", false); _set("ROCPROFSYS_USE_RCCLP", false); _set("ROCPROFSYS_USE_OMPT", false); @@ -1164,12 +1126,11 @@ configure_mode_settings(const std::shared_ptr& _config) if(gpu::device_count() == 0) { -#if ROCPROFSYS_HIP_VERSION > 0 - ROCPROFSYS_BASIC_VERBOSE(1, "No HIP devices were found: disabling roctracer, " - "rocprofiler, and rocm_smi...\n"); +#if ROCPROFSYS_ROCM_VERSION > 0 + ROCPROFSYS_BASIC_VERBOSE( + 1, "No ROCm devices were found: 
disabling rocm and rocm_smi...\n"); #endif - _set("ROCPROFSYS_USE_ROCPROFILER", false); - _set("ROCPROFSYS_USE_ROCTRACER", false); + _set("ROCPROFSYS_USE_ROCM", false); _set("ROCPROFSYS_USE_ROCM_SMI", false); } @@ -1202,9 +1163,8 @@ configure_mode_settings(const std::shared_ptr& _config) _set("ROCPROFSYS_USE_TRACE", false); _set("ROCPROFSYS_PROFILE", false); _set("ROCPROFSYS_USE_CAUSAL", false); + _set("ROCPROFSYS_USE_ROCM", false); _set("ROCPROFSYS_USE_ROCM_SMI", false); - _set("ROCPROFSYS_USE_ROCTRACER", false); - _set("ROCPROFSYS_USE_ROCPROFILER", false); _set("ROCPROFSYS_USE_KOKKOSP", false); _set("ROCPROFSYS_USE_RCCLP", false); _set("ROCPROFSYS_USE_OMPT", false); @@ -1389,22 +1349,9 @@ configure_disabled_settings(const std::shared_ptr& _config) _handle_use_option("ROCPROFSYS_USE_OMPT", "ompt"); _handle_use_option("ROCPROFSYS_USE_RCCLP", "rcclp"); _handle_use_option("ROCPROFSYS_USE_ROCM_SMI", "rocm_smi"); - _handle_use_option("ROCPROFSYS_USE_ROCTRACER", "roctracer"); - _handle_use_option("ROCPROFSYS_USE_ROCPROFILER", "rocprofiler"); + _handle_use_option("ROCPROFSYS_USE_ROCM", "rocm"); -#if !defined(ROCPROFSYS_USE_ROCTRACER) || ROCPROFSYS_USE_ROCTRACER == 0 - _config->find("ROCPROFSYS_USE_ROCTRACER")->second->set_hidden(true); - for(const auto& itr : _config->disable_category("roctracer")) - _config->find(itr)->second->set_hidden(true); -#endif - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) || ROCPROFSYS_USE_ROCPROFILER == 0 - _config->find("ROCPROFSYS_USE_ROCPROFILER")->second->set_hidden(true); - for(const auto& itr : _config->disable_category("rocprofiler")) - _config->find(itr)->second->set_hidden(true); -#endif - -#if !defined(ROCPROFSYS_USE_ROCM_SMI) || ROCPROFSYS_USE_ROCM_SMI == 0 +#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0 _config->find("ROCPROFSYS_USE_ROCM_SMI")->second->set_hidden(true); for(const auto& itr : _config->disable_category("rocm_smi")) _config->find(itr)->second->set_hidden(true); @@ -1567,7 +1514,7 @@ 
print_banner(std::ostream& _os) { "tag", ROCPROFSYS_GIT_DESCRIBE }, { "", ROCPROFSYS_LIBRARY_ARCH }, { "compiler", ROCPROFSYS_COMPILER_STRING }, - { "rocm", ROCPROFSYS_HIP_VERSION_COMPAT_STRING } }); + { "rocm", ROCPROFSYS_ROCM_VERSION_COMPAT_STRING } }); // () if(!_properties.empty()) @@ -1797,10 +1744,7 @@ get_debug_env() bool get_is_continuous_integration() { - if(!settings_are_configured()) - return tim::get_env("ROCPROFSYS_CI", false, false); - static auto _v = get_config()->find("ROCPROFSYS_CI"); - return static_cast&>(*_v->second).get(); + return is_ci_value; } bool @@ -1818,8 +1762,8 @@ get_debug_finalize() bool get_debug() { - static auto _v = get_config()->find("ROCPROFSYS_DEBUG"); - return static_cast&>(*_v->second).get(); + std::call_once(configure_once, []() { (void) get_config(); }); + return debug_value; } bool @@ -1842,15 +1786,15 @@ get_verbose_env() int get_verbose() { - static auto _v = get_config()->find("ROCPROFSYS_VERBOSE"); - return static_cast&>(*_v->second).get(); + std::call_once(configure_once, []() { (void) get_config(); }); + return verbose_value; } bool& get_use_perfetto() { - static auto _v = get_config()->find("ROCPROFSYS_TRACE"); - return static_cast&>(*_v->second).get(); + static auto _v = get_config()->at("ROCPROFSYS_TRACE"); + return static_cast&>(*_v).get(); } bool& @@ -1867,43 +1811,10 @@ get_use_causal() return static_cast&>(*_v->second).get(); } -bool -get_use_roctracer() -{ -#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0 - static auto _v = get_config()->find("ROCPROFSYS_USE_ROCTRACER"); - return static_cast&>(*_v->second).get(); -#else - return false; -#endif -} - -bool -get_perfetto_roctracer_per_stream() -{ -#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0 - static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_ROCTRACER_PER_STREAM"); - return static_cast&>(*_v->second).get(); -#else - return false; -#endif -} - -bool -get_use_rocprofiler() -{ -#if 
defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 - static auto _v = get_config()->find("ROCPROFSYS_USE_ROCPROFILER"); - return static_cast&>(*_v->second).get(); -#else - return false; -#endif -} - bool get_use_rocm_smi() { -#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0 +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 static auto _v = get_config()->find("ROCPROFSYS_USE_ROCM_SMI"); return static_cast&>(*_v->second).get(); #else @@ -1911,17 +1822,6 @@ get_use_rocm_smi() #endif } -bool -get_use_roctx() -{ -#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0 - static auto _v = get_config()->find("ROCPROFSYS_USE_ROCTX"); - return static_cast&>(*_v->second).get(); -#else - return false; -#endif -} - bool& get_use_sampling() { @@ -2031,34 +1931,6 @@ get_sampling_cputime_signal() return static_cast&>(*_v->second).get(); } -bool -get_trace_hip_api() -{ - static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HIP_API"); - return static_cast&>(*_v->second).get(); -} - -bool -get_trace_hip_activity() -{ - static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HIP_ACTIVITY"); - return static_cast&>(*_v->second).get(); -} - -bool -get_trace_hsa_api() -{ - static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HSA_API"); - return static_cast&>(*_v->second).get(); -} - -bool -get_trace_hsa_activity() -{ - static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HSA_ACTIVITY"); - return static_cast&>(*_v->second).get(); -} - size_t get_perfetto_shmem_size_hint() { @@ -2176,14 +2048,6 @@ get_thread_pool_size() return _v; } -std::string -get_trace_hsa_api_types() -{ - static std::string _v = - get_config()->get("ROCPROFSYS_ROCTRACER_HSA_API_TYPES"); - return _v; -} - std::string& get_perfetto_backend() { @@ -2360,7 +2224,7 @@ get_process_sampling_duration() std::string get_sampling_gpus() { -#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0 +#if defined(ROCPROFSYS_USE_ROCM) && 
ROCPROFSYS_USE_ROCM > 0 static auto _v = get_config()->find("ROCPROFSYS_SAMPLING_GPUS"); return static_cast&>(*_v->second).get(); #else @@ -2375,13 +2239,6 @@ get_trace_thread_locks() return static_cast&>(*_v->second).get(); } -std::string -get_rocm_events() -{ - static auto _v = get_config()->find("ROCPROFSYS_ROCM_EVENTS"); - return static_cast&>(*_v->second).get(); -} - bool get_trace_thread_rwlocks() { diff --git a/projects/rocprofiler-systems/source/lib/core/config.hpp b/projects/rocprofiler-systems/source/lib/core/config.hpp index 2ccbce234a..609ca9816d 100644 --- a/projects/rocprofiler-systems/source/lib/core/config.hpp +++ b/projects/rocprofiler-systems/source/lib/core/config.hpp @@ -101,17 +101,22 @@ get_exe_realpath(); template bool -set_setting_value(const std::string& _name, Tp&& _v) +set_setting_value(const std::string& _name, Tp&& _v, + settings::update_type _upd = settings::update_type::user) { - auto _user_upd = tim::settings::update_type::user; - auto _instance = tim::settings::shared_instance(); - auto _setting = _instance->find(_name); + auto* _instance = tim::settings::instance(); + if(!_instance) return false; + + auto _setting = _instance->find(_name); if(_setting == _instance->end()) return false; if(!_setting->second) return false; + auto& itr = _setting->second; - auto _upd = itr->set_user_updated(); - auto _success = itr->set(std::forward(_v), _user_upd); - if(!_success) itr->set_updated(_upd); + auto _old_upd = itr->get_updated_type(); + + auto _success = itr->set(std::forward(_v), _upd); + if(!_success) itr->set_updated(_old_upd); + return _success; } @@ -119,10 +124,13 @@ template bool set_default_setting_value(const std::string& _name, Tp&& _v) { - auto _instance = tim::settings::shared_instance(); - auto _setting = _instance->find(_name); + auto* _instance = tim::settings::instance(); + if(!_instance) return false; + + auto _setting = _instance->find(_name); if(_setting == _instance->end()) return false; if(!_setting->second) return 
false; + if(_setting->second->get_config_updated() || _setting->second->get_environ_updated()) return false; return _setting->second->set(std::forward(_v)); @@ -132,10 +140,12 @@ template std::optional get_setting_value(const std::string& _name) { - auto _instance = tim::settings::shared_instance(); - if(!_instance) return std::optional{}; + auto* _instance = tim::settings::instance(); + if(!_instance) return std::nullopt; + auto _setting = _instance->find(_name); if(_setting == _instance->end() || !_setting->second) return std::optional{}; + auto&& _ret = _setting->second->get(); return (_ret.first) ? std::optional{ _ret.second } : std::optional{}; } @@ -194,18 +204,9 @@ get_use_timemory() ROCPROFSYS_HOT; bool& get_use_causal() ROCPROFSYS_HOT; -bool -get_use_roctracer() ROCPROFSYS_HOT; - -bool -get_use_rocprofiler() ROCPROFSYS_HOT; - bool get_use_rocm_smi() ROCPROFSYS_HOT; -bool -get_use_roctx(); - bool& get_use_sampling() ROCPROFSYS_HOT; @@ -236,18 +237,6 @@ get_sampling_keep_internal(); bool get_use_rcclp(); -bool -get_trace_hip_api(); - -bool -get_trace_hip_activity(); - -bool -get_trace_hsa_api(); - -bool -get_trace_hsa_activity(); - size_t get_perfetto_shmem_size_hint(); @@ -272,9 +261,6 @@ get_perfetto_annotations() ROCPROFSYS_HOT; uint64_t get_thread_pool_size(); -std::string -get_trace_hsa_api_types(); - std::string& get_perfetto_backend(); @@ -282,9 +268,6 @@ get_perfetto_backend(); std::string get_perfetto_output_filename(); -bool -get_perfetto_roctracer_per_stream() ROCPROFSYS_HOT; - double get_trace_delay(); @@ -360,9 +343,6 @@ get_trace_thread_barriers(); bool get_trace_thread_join(); -std::string -get_rocm_events(); - bool get_use_tmp_files(); diff --git a/projects/rocprofiler-systems/source/lib/core/containers/stable_vector.hpp b/projects/rocprofiler-systems/source/lib/core/containers/stable_vector.hpp index f52cf7ebfc..929044d696 100644 --- a/projects/rocprofiler-systems/source/lib/core/containers/stable_vector.hpp +++ 
b/projects/rocprofiler-systems/source/lib/core/containers/stable_vector.hpp @@ -209,7 +209,7 @@ public: void push_back(Tp&& t); template - void emplace_back(Args&&... args); + decltype(auto) emplace_back(Args&&... args); reference operator[](size_type i); @@ -229,6 +229,14 @@ private: storage_type m_chunks; }; +template +template +decltype(auto) +stable_vector::emplace_back(Args&&... args) +{ + return last_chunk().emplace_back(std::forward(args)...); +} + template stable_vector::stable_vector(size_type count, const Tp& value) { @@ -332,14 +340,6 @@ stable_vector::push_back(Tp&& t) last_chunk().push_back(std::move(t)); } -template -template -void -stable_vector::emplace_back(Args&&... args) -{ - last_chunk().emplace_back(std::forward(args)...); -} - template typename stable_vector::reference stable_vector::operator[](size_type i) diff --git a/projects/rocprofiler-systems/source/lib/core/gpu.cpp b/projects/rocprofiler-systems/source/lib/core/gpu.cpp index 1dfc7c468b..091464f878 100644 --- a/projects/rocprofiler-systems/source/lib/core/gpu.cpp +++ b/projects/rocprofiler-systems/source/lib/core/gpu.cpp @@ -20,22 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#define ROCPROFILER_SDK_CEREAL_NAMESPACE_BEGIN \ + namespace tim \ + { \ + namespace cereal \ + { +#define ROCPROFILER_SDK_CEREAL_NAMESPACE_END \ + } \ + } // namespace ::tim::cereal + #include "common/defines.h" -#if !defined(ROCPROFSYS_USE_ROCM_SMI) -# define ROCPROFSYS_USE_ROCM_SMI 0 -#endif - -#if !defined(ROCPROFSYS_USE_HIP) -# define ROCPROFSYS_USE_HIP 0 -#endif - -#include "core/hip_runtime.hpp" - -#if ROCPROFSYS_USE_HIP > 0 -# if !defined(TIMEMORY_USE_HIP) -# define TIMEMORY_USE_HIP 1 -# endif +#if !defined(ROCPROFSYS_USE_ROCM) +# define ROCPROFSYS_USE_ROCM 0 #endif #include "debug.hpp" @@ -44,24 +41,11 @@ #include -#if ROCPROFSYS_USE_ROCM_SMI > 0 +#if ROCPROFSYS_USE_ROCM > 0 # include -#endif - -#if ROCPROFSYS_USE_HIP > 0 -# include - -# if !defined(ROCPROFSYS_HIP_RUNTIME_CALL) -# define ROCPROFSYS_HIP_RUNTIME_CALL(err) \ - { \ - if(err != ::tim::hip::success_v && (int) err != 0) \ - { \ - ROCPROFSYS_THROW( \ - "[%s:%d] Warning! HIP API call failed with code %i :: %s\n", \ - __FILE__, __LINE__, (int) err, hipGetErrorString(err)); \ - } \ - } -# endif +# include +# include +# include #endif namespace rocprofsys @@ -70,9 +54,7 @@ namespace gpu { namespace { -namespace scope = ::tim::scope; - -#if ROCPROFSYS_USE_ROCM_SMI > 0 +#if ROCPROFSYS_USE_ROCM > 0 # define ROCPROFSYS_ROCM_SMI_CALL(ERROR_CODE) \ ::rocprofsys::gpu::check_rsmi_error(ERROR_CODE, __FILE__, __LINE__) @@ -108,99 +90,47 @@ rsmi_init() return _rsmi_init; } -#endif +#endif // ROCPROFSYS_USE_ROCM > 0 -#if ROCPROFSYS_HIP_VERSION >= 60000 -template ::value, int> = 0> -void -device_prop_serialize(ArchiveT& archive, const char* name, const ArgT& arg) +int32_t +query_rocm_gpu_agents() { - namespace cereal = tim::cereal; - using cereal::make_nvp; - archive(make_nvp(name, arg)); -} - -template -void -device_prop_serialize(ArchiveT& archive, const char* name, ArgT arg[N]) -{ - if constexpr(!std::is_same::value && - !std::is_same::value) - { - namespace cereal = tim::cereal; - using cereal::make_nvp; - 
auto data = std::array{}; - for(size_t i = 0; i < N; ++i) - data[i] = arg[i]; - archive(make_nvp(name, data)); - } - else - { - device_prop_serialize(archive, name, std::string{ arg }); - } -} - -template -void -device_prop_serialize(ArchiveT& archive, const char* name, hipUUID_t arg) -{ - constexpr auto N = sizeof(arg.bytes); - namespace cereal = tim::cereal; - using cereal::make_nvp; - auto data = std::array{}; - data.fill('\0'); - for(size_t i = 0; i < N; ++i) - data[i] = arg.bytes[i]; - auto str_v = std::string_view{ data.data() }; - auto str = std::string{ str_v }.substr(0, str_v.find('\0')); - archive(make_nvp(name, str)); -} - -template -void -device_prop_serialize(ArchiveT& archive, const char* name, hipDeviceArch_t arg) -{ - namespace cereal = tim::cereal; - using cereal::make_nvp; - -# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(NAME) \ - { \ - auto val = arg.NAME; \ - archive(make_nvp(#NAME, val)); \ + int32_t _dev_cnt = 0; +#if ROCPROFSYS_USE_ROCM > 0 + auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents, + size_t num_agents, void* user_data) -> rocprofiler_status_t { + auto* _cnt = static_cast(user_data); + for(size_t i = 0; i < num_agents; ++i) + { + const auto* _agent = static_cast(agents[i]); + if(_agent && _agent->type == ROCPROFILER_AGENT_TYPE_GPU) *_cnt += 1; } + return ROCPROFILER_STATUS_SUCCESS; + }; - archive.setNextName(name); - archive.startNode(); - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt32Atomics) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalFloatAtomicExch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt32Atomics) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedFloatAtomicExch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFloatAtomicAdd) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt64Atomics) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt64Atomics) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDoubles) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpVote) - 
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpBallot) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpShuffle) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFunnelShift) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasThreadFenceSystem) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSyncThreadsExt) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSurfaceFuncs) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(has3dGrid) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDynamicParallelism) - archive.finishNode(); - -# undef ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH -} + try + { + rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator, + sizeof(rocprofiler_agent_v0_t), &_dev_cnt); + } catch(std::exception& _e) + { + ROCPROFSYS_BASIC_VERBOSE( + 1, "Exception thrown getting the rocm agents: %s. _dev_cnt=%d\n", _e.what(), + _dev_cnt); + } + // rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator, + // sizeof(rocprofiler_agent_v0_t), &_dev_cnt); #endif + return _dev_cnt; +} } // namespace int -hip_device_count() +rocm_device_count() { -#if ROCPROFSYS_USE_HIP > 0 - return ::tim::hip::device_count(); +#if ROCPROFSYS_USE_ROCM > 0 + static int _num_devices = query_rocm_gpu_agents(); + return _num_devices; #else return 0; #endif @@ -209,7 +139,7 @@ hip_device_count() int rsmi_device_count() { -#if ROCPROFSYS_USE_ROCM_SMI > 0 +#if ROCPROFSYS_USE_ROCM > 0 if(!rsmi_init()) return 0; static auto _num_devices = []() { @@ -234,11 +164,8 @@ rsmi_device_count() int device_count() { -#if ROCPROFSYS_USE_ROCM_SMI > 0 - // store as static since calls after rsmi_shutdown will return zero - return rsmi_device_count(); -#elif ROCPROFSYS_USE_HIP > 0 - return ::tim::hip::device_count(); +#if ROCPROFSYS_USE_ROCM > 0 + return rocm_device_count(); #else return 0; #endif @@ -246,251 +173,44 @@ device_count() template void -add_hip_device_metadata(ArchiveT& ar) +add_device_metadata(ArchiveT& ar) { namespace cereal = tim::cereal; using cereal::make_nvp; -#if ROCPROFSYS_USE_HIP > 0 - int _device_count 
= 0; - int _current_device = 0; - hipError_t _device_count_err = hipGetDeviceCount(&_device_count); +#if ROCPROFSYS_USE_ROCM > 0 + using agent_vec_t = std::vector; - if(_device_count_err != hipSuccess) return; - - hipError_t _current_device_err = hipGetDevice(&_current_device); - - scope::destructor _dtor{ [_current_device, _current_device_err]() { - if(_current_device_err == hipSuccess) + auto _agents_vec = agent_vec_t{}; + auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents, + size_t num_agents, void* user_data) -> rocprofiler_status_t { + auto* _agents_vec_v = static_cast(user_data); + _agents_vec_v->reserve(num_agents); + for(size_t i = 0; i < num_agents; ++i) { - ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(_current_device)); + const auto* _agent = static_cast(agents[i]); + if(_agent) _agents_vec_v->emplace_back(*_agent); } - } }; + return ROCPROFILER_STATUS_SUCCESS; + }; + rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator, + sizeof(rocprofiler_agent_v0_t), &_agents_vec); - if(_current_device_err != hipSuccess || _device_count == 0) return; - - ar.setNextName("hip_device_properties"); - ar.startNode(); - ar.makeArray(); - - scope::destructor _prop_dtor{ [&ar]() { ar.finishNode(); } }; - for(int dev = 0; dev < _device_count; ++dev) - { - auto _device_prop = hipDeviceProp_t{}; - int _driver_version = 0; - int _runtime_version = 0; - ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(dev)); - ROCPROFSYS_HIP_RUNTIME_CALL(hipGetDeviceProperties(&_device_prop, dev)); - ROCPROFSYS_HIP_RUNTIME_CALL(hipDriverGetVersion(&_driver_version)); - ROCPROFSYS_HIP_RUNTIME_CALL(hipRuntimeGetVersion(&_runtime_version)); - - ar.startNode(); - -# if ROCPROFSYS_HIP_VERSION < 60000 - using intvec_t = std::vector; - -# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \ - ar(make_nvp(#NAME, _device_prop.NAME)); - -# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(NAME, ...) 
\ - ar(make_nvp(NAME, __VA_ARGS__)); - - ar(make_nvp("name", std::string{ _device_prop.name })); - ar(make_nvp("driver_version", _driver_version)); - ar(make_nvp("runtime_version", _runtime_version)); - ar(make_nvp("capability.major_version", _device_prop.major)); - ar(make_nvp("capability.minor_version", _device_prop.minor)); - - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate) - -# if ROCPROFSYS_HIP_VERSION >= 50000 - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize) -# endif - - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY( - "maxThreadsDim", - intvec_t{ _device_prop.maxThreadsDim[0], _device_prop.maxThreadsDim[1], - _device_prop.maxThreadsDim[2] }) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY( - "maxGridSize", - intvec_t{ _device_prop.maxGridSize[0], _device_prop.maxGridSize[1], - _device_prop.maxGridSize[2] }) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID) - 
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision) -# else -# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \ - device_prop_serialize(ar, #NAME, _device_prop.NAME); - - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(name) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(uuid) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luid) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luidDeviceNodeMask) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsDim) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxGridSize) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(major) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(minor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(texturePitchAlignment) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deviceOverlap) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount) - 
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1D) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DMipmap) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLinear) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2D) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DMipmap) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLinear) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DGather) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3D) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3DAlt) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemap) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLayered) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLayered) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemapLayered) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1D) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2D) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface3D) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1DLayered) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2DLayered) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemap) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemapLayered) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(surfaceAlignment) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(tccDriver) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asyncEngineCount) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedAddressing) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth) - 
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(persistingL2CacheMaxSize) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(streamPrioritiesSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(globalL1CacheSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(localL1CacheSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerMultiprocessor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerMultiprocessor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(managedMemory) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiGpuBoardGroupID) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostNativeAtomicSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(singleToDoublePrecisionPerfRatio) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computePreemptionSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canUseHostPointerForRegisteredMem) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlockOptin) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxBlocksPerMultiProcessor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(accessPolicyMaxWindowSize) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(reservedSharedMemPerBlock) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sparseHipArraySupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterReadOnlySupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(timelineSemaphoreInteropSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolsSupported) - 
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMASupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAFlushWritesOptions) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAWritesOrdering) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolSupportedHandleTypes) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deferredMappingHipArraySupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ipcEventSupported) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clusterLaunch) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedFunctionPointers) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(arch) - // ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpMemFlushCntl) - // ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpRegFlushCntl) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedFunc) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedGridDim) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedBlockDim) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedSharedMem) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isLargeBar) - ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision) -# endif - - const auto _compute_mode_descr = std::array{ - "Default (multiple host threads can use ::hipSetDevice() with device " - "simultaneously)", - "Exclusive (only one host thread in one process is able to use " - "::hipSetDevice() with this device)", - "Prohibited (no host thread can use ::hipSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use " - "::hipSetDevice() with this device)", - "Unknown", - nullptr - }; - - auto _compute_mode = std::min(_device_prop.computeMode, 5); - ar(make_nvp("computeModeDescription", - std::string{ _compute_mode_descr.at(_compute_mode) })); - - ar.finishNode(); - } + ar(make_nvp("rocm_agents", _agents_vec)); #else 
(void) ar; #endif } void -add_hip_device_metadata() +add_device_metadata() { if(device_count() == 0) return; ROCPROFSYS_METADATA([](auto& ar) { try { - add_hip_device_metadata(ar); + add_device_metadata(ar); } catch(std::runtime_error& _e) { ROCPROFSYS_VERBOSE(2, "%s\n", _e.what()); diff --git a/projects/rocprofiler-systems/source/lib/core/gpu.hpp b/projects/rocprofiler-systems/source/lib/core/gpu.hpp index 0989284b4d..cf8cfe6168 100644 --- a/projects/rocprofiler-systems/source/lib/core/gpu.hpp +++ b/projects/rocprofiler-systems/source/lib/core/gpu.hpp @@ -30,12 +30,12 @@ int device_count(); int -hip_device_count(); +rocm_device_count(); int rsmi_device_count(); void -add_hip_device_metadata(); +add_device_metadata(); } // namespace gpu } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/core/hip_runtime.hpp b/projects/rocprofiler-systems/source/lib/core/hip_runtime.hpp index 9ec902dd7d..24dbff5a78 100644 --- a/projects/rocprofiler-systems/source/lib/core/hip_runtime.hpp +++ b/projects/rocprofiler-systems/source/lib/core/hip_runtime.hpp @@ -24,7 +24,7 @@ #include "core/defines.hpp" -#if defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0 +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 # if defined(HIP_INCLUDE_HIP_HIP_RUNTIME_H) || \ defined(HIP_INCLUDE_HIP_HIP_RUNTIME_API_H) @@ -35,22 +35,17 @@ # define HIP_PROF_HIP_API_STRING 1 // following must be included before for ROCm 6.0+ -# if ROCPROFSYS_HIP_VERSION >= 60000 -# if defined(USE_PROF_API) -# undef USE_PROF_API -# endif -# include -# include -// must be included after hip_runtime_api.h -# include -// must be included after hip_runtime_api.h -# include -// must be included after hip_runtime_api.h -# include -# else -# include -# include +# if defined(USE_PROF_API) +# undef USE_PROF_API # endif +# include +# include +// must be included after hip_runtime_api.h +# include +// must be included after hip_runtime_api.h +# include +// must be included after hip_runtime_api.h 
+# include # include #endif diff --git a/projects/rocprofiler-systems/source/lib/core/perfetto.hpp b/projects/rocprofiler-systems/source/lib/core/perfetto.hpp index 53c11effb3..10e8d3a9e9 100644 --- a/projects/rocprofiler-systems/source/lib/core/perfetto.hpp +++ b/projects/rocprofiler-systems/source/lib/core/perfetto.hpp @@ -104,6 +104,7 @@ perfetto_counter_track::emplace(size_t _idx, const std::string& _v, for(const auto& itr : _name_data) { _missing.emplace_back(std::make_tuple(*itr, itr->c_str(), false)); + // TODO: _missing.emplace_back(*itr, itr->c_str(), false); } } auto _index = _track_data.size(); diff --git a/projects/rocprofiler-systems/source/lib/core/rccl.hpp b/projects/rocprofiler-systems/source/lib/core/rccl.hpp index 68ef13e243..53aec9476d 100644 --- a/projects/rocprofiler-systems/source/lib/core/rccl.hpp +++ b/projects/rocprofiler-systems/source/lib/core/rccl.hpp @@ -23,13 +23,7 @@ #pragma once #include "core/defines.hpp" -#include "core/hip_runtime.hpp" -#if defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0 && \ - defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0 -# if ROCPROFSYS_HIP_VERSION == 0 || ROCPROFSYS_HIP_VERSION >= 50200 -# include -# else -# include -# endif +#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0 +# include #endif diff --git a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp new file mode 100644 index 0000000000..c53bc25a75 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp @@ -0,0 +1,576 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "core/rocprofiler-sdk.hpp" +#include "core/config.hpp" +#include "core/debug.hpp" +#include "timemory.hpp" +#include + +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 + +# include +# include + +# include +# include +# include + +# include +# include +# include +# include +# include +# include +# include + +# define ROCPROFILER_CALL(result) \ + { \ + rocprofiler_status_t CHECKSTATUS = (result); \ + if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ + { \ + auto msg = std::stringstream{}; \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + msg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " \ + << "rocprofiler-sdk call [" << #result \ + << "] failed with error code " << CHECKSTATUS \ + << " :: " << status_msg; \ + ROCPROFSYS_WARNING(0, "%s\n", msg.str().c_str()); \ + } \ + } + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +namespace +{ +std::string +get_setting_name(std::string _v) +{ + constexpr auto _prefix = tim::string_view_t{ "rocprofsys_" }; + for(auto& itr : _v) + itr = tolower(itr); + auto _pos = _v.find(_prefix); + if(_pos == 0) return _v.substr(_prefix.length()); + return _v; +} + +# define ROCPROFSYS_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...) \ + [&]() { \ + auto _ret = _config->insert( \ + ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, \ + TYPE{ INITIAL_VALUE }, \ + std::set{ "custom", "rocprofsys", "librocprof-sys", \ + __VA_ARGS__ }); \ + if(!_ret.second) \ + { \ + ROCPROFSYS_PRINT("Warning! 
Duplicate setting: %s / %s\n", \ + get_setting_name(ENV_NAME).c_str(), ENV_NAME); \ + } \ + return _config->find(ENV_NAME)->second; \ + }() + +template +std::string +to_lower(const Tp& _val) +{ + auto _v = std::string{ _val }; + for(auto& itr : _v) + itr = ::tolower(itr); + return _v; +} + +struct operation_options +{ + std::string operations_include = {}; + std::string operations_exclude = {}; + std::string operations_annotate_backtrace = {}; +}; + +auto callback_operation_option_names = + std::unordered_map{}; +auto buffered_operation_option_names = + std::unordered_map{}; + +std::unordered_set +get_operations_impl(rocprofiler_callback_tracing_kind_t kindv, + const std::string& optname = {}) +{ + static const auto callback_tracing_info = + rocprofiler::sdk::get_callback_tracing_names(); + + if(optname.empty()) + { + auto _ret = std::unordered_set{}; + for(auto iitr : callback_tracing_info[kindv].items()) + { + if(iitr.second && *iitr.second != "none") _ret.emplace(iitr.first); + } + return _ret; + } + + auto _val = get_setting_value(optname); + + ROCPROFSYS_CONDITIONAL_ABORT_F(!_val, "no setting %s\n", optname.c_str()); + + if(_val->empty()) return std::unordered_set{}; + + auto _ret = std::unordered_set{}; + for(const auto& itr : tim::delimit(*_val, " ,;:\n\t")) + { + for(auto iitr : callback_tracing_info[kindv].items()) + { + auto _re = std::regex{ itr, std::regex_constants::icase }; + if(iitr.second && std::regex_search(iitr.second->data(), _re)) + { + ROCPROFSYS_PRINT_F("%s ('%s') matched: %s\n", optname.c_str(), + itr.c_str(), iitr.second->data()); + _ret.emplace(iitr.first); + } + } + } + + return _ret; +} + +std::unordered_set +get_operations_impl(rocprofiler_buffer_tracing_kind_t kindv, + const std::string& optname = {}) +{ + static const auto buffered_tracing_info = + rocprofiler::sdk::get_buffer_tracing_names(); + + if(optname.empty()) + { + auto _ret = std::unordered_set{}; + for(auto iitr : buffered_tracing_info[kindv].items()) + { + if(iitr.second && 
*iitr.second != "none") _ret.emplace(iitr.first); + } + return _ret; + } + + auto _val = get_setting_value(optname); + + ROCPROFSYS_CONDITIONAL_ABORT_F(!_val, "no setting %s\n", optname.c_str()); + + if(_val->empty()) return std::unordered_set{}; + + auto _ret = std::unordered_set{}; + for(const auto& itr : tim::delimit(*_val, " ,;:\n\t")) + { + for(auto iitr : buffered_tracing_info[kindv].items()) + { + auto _re = std::regex{ itr, std::regex_constants::icase }; + if(iitr.second && std::regex_search(iitr.second->data(), _re)) + { + ROCPROFSYS_PRINT_F("%s ('%s') matched: %s\n", optname.c_str(), + itr.c_str(), iitr.second->data()); + _ret.emplace(iitr.first); + } + } + } + return _ret; +} + +std::vector +get_operations_impl(const std::unordered_set& _complete, + const std::unordered_set& _include, + const std::unordered_set& _exclude) +{ + auto _convert = [](const auto& _dset) { + auto _dret = std::vector{}; + _dret.reserve(_dset.size()); + for(auto itr : _dset) + _dret.emplace_back(itr); + std::sort(_dret.begin(), _dret.end()); + return _dret; + }; + + if(_include.empty() && _exclude.empty()) return _convert(_complete); + + auto _ret = (_include.empty()) ? 
_complete : _include; + for(auto itr : _exclude) + _ret.erase(itr); + + return _convert(_ret); +} + +} // namespace + +void +config_settings(const std::shared_ptr& _config) +{ + // const auto agents = std::vector{}; + const auto buffered_tracing_info = rocprofiler::sdk::get_buffer_tracing_names(); + const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names(); + + auto _skip_domains = + std::unordered_set{ "none", + "correlation_id_retirement", + "marker_core_api", + "marker_control_api", + "marker_name_api", + "code_object" }; + + auto _domain_choices = std::vector{}; + auto _add_domain = [&_domain_choices, &_skip_domains](std::string_view _domain) { + auto _v = to_lower(_domain); + + if(_skip_domains.count(_v) == 0) + { + auto itr = std::find(_domain_choices.begin(), _domain_choices.end(), _v); + if(itr == _domain_choices.end()) _domain_choices.emplace_back(_v); + } + }; + + static auto _option_names = std::unordered_set{}; + auto _add_operation_settings = [&_config, &_skip_domains]( + std::string_view _domain_name, const auto& _domain, + auto& _operation_option_names) { + auto _v = to_lower(_domain_name); + + if(_skip_domains.count(_v) > 0) return; + + auto _op_option_name = JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS"); + auto _eop_option_name = + JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS_EXCLUDE"); + auto _bt_option_name = + JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS_ANNOTATE_BACKTRACE"); + + auto _op_choices = std::vector{}; + for(auto itr : _domain.operations) + _op_choices.emplace_back(std::string{ itr }); + + if(_op_choices.empty()) return; + + _operation_option_names.emplace( + _domain.value, + operation_options{ _op_option_name, _eop_option_name, _bt_option_name }); + + if(_option_names.emplace(_op_option_name).second) + { + ROCPROFSYS_CONFIG_SETTING( + std::string, _op_option_name.c_str(), + "Inclusive filter for domain operations (for API domains, this selects " + "the functions to trace) [regex 
supported]", + std::string{}, "rocm", "rocprofiler-sdk", "advanced") + ->set_choices(_op_choices); + } + + if(_option_names.emplace(_eop_option_name).second) + { + ROCPROFSYS_CONFIG_SETTING( + std::string, _eop_option_name.c_str(), + "Exclusive filter for domain operations applied after the inclusive " + "filter (for API domains, removes function from trace) [regex supported]", + std::string{}, "rocm", "rocprofiler-sdk", "advanced") + ->set_choices(_op_choices); + } + + if(_option_names.emplace(_bt_option_name).second) + { + ROCPROFSYS_CONFIG_SETTING( + std::string, _bt_option_name.c_str(), + "Specification of domain operations which will record a backtrace (for " + "API domains, this is a list of function names) [regex supported]", + std::string{}, "rocm", "rocprofiler-sdk", "advanced") + ->set_choices(_op_choices); + } + }; + + _domain_choices.reserve(buffered_tracing_info.size()); + _domain_choices.reserve(callback_tracing_info.size()); + _add_domain("hip_api"); + _add_domain("hsa_api"); + _add_domain("marker_api"); + + for(const auto& itr : buffered_tracing_info) + _add_domain(itr.name); + + for(const auto& itr : callback_tracing_info) + _add_domain(itr.name); + + std::sort(_domain_choices.begin(), _domain_choices.end()); + + namespace join = ::timemory::join; + auto _domain_description = + JOIN("", "Specification of ROCm domains to trace/profile. Choices: ", + join::join(join::array_config{ ", ", "", "" }, _domain_choices)); + + ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_DOMAINS", _domain_description, + std::string{ "hip_runtime_api,marker_api,kernel_dispatch," + "memory_copy,scratch_memory,page_migration" }, + "rocm", "rocprofiler-sdk") + ->set_choices(_domain_choices); + + ROCPROFSYS_CONFIG_SETTING( + std::string, "ROCPROFSYS_ROCM_EVENTS", + "ROCm hardware counters. Use ':device=N' syntax to specify collection on device " + "number N, e.g. ':device=0'. 
If no device specification is provided, the event " + "is collected on every available device", + "", "rocm", "hardware_counters"); + + _skip_domains.emplace("kernel_dispatch"); + _skip_domains.emplace("page_migration"); + _skip_domains.emplace("scratch_memory"); + + _add_operation_settings( + "MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API], + callback_operation_option_names); + + for(const auto& itr : callback_tracing_info) + _add_operation_settings(itr.name, itr, callback_operation_option_names); + + for(const auto& itr : buffered_tracing_info) + _add_operation_settings(itr.name, itr, buffered_operation_option_names); +} + +std::unordered_set +get_callback_domains() +{ + const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names(); + const auto supported = std::unordered_set{ + ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API, + ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + }; + + auto _data = std::unordered_set{}; + auto _domains = + tim::delimit(config::get_setting_value("ROCPROFSYS_ROCM_DOMAINS") + .value_or(std::string{}), + " ,;:\t\n"); + + const auto valid_choices = + settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices(); + + auto invalid_domain = [&valid_choices](const auto& domainv) { + return !std::any_of(valid_choices.begin(), valid_choices.end(), + [&domainv](const auto& aitr) { return (aitr == domainv); }); + }; + + for(const auto& itr : _domains) + { + if(invalid_domain(itr)) + { + ROCPROFSYS_THROW("unsupported ROCPROFSYS_ROCM_DOMAINS value: %s\n", + itr.c_str()); + } + + if(itr == "hsa_api") + { + for(auto eitr : { ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API, + 
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API }) + _data.emplace(eitr); + } + else if(itr == "hip_api") + { + for(auto eitr : { ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, + ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API }) + _data.emplace(eitr); + } + else if(itr == "marker_api" || itr == "roctx") + { + _data.emplace(ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API); + } + else + { + for(size_t idx = 0; idx < callback_tracing_info.size(); ++idx) + { + auto ditr = callback_tracing_info[idx]; + auto dval = static_cast(idx); + if(itr == to_lower(ditr.name) && supported.count(dval) > 0) + { + _data.emplace(dval); + break; + } + } + } + } + + return _data; +} + +std::unordered_set +get_buffered_domains() +{ + const auto buffer_tracing_info = rocprofiler::sdk::get_buffer_tracing_names(); + const auto supported = std::unordered_set{ + ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, + }; + + auto _data = std::unordered_set{}; + auto _domains = + tim::delimit(config::get_setting_value("ROCPROFSYS_ROCM_DOMAINS") + .value_or(std::string{}), + " ,;:\t\n"); + const auto valid_choices = + settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices(); + + auto invalid_domain = [&valid_choices](const auto& domainv) { + return !std::any_of(valid_choices.begin(), valid_choices.end(), + [&domainv](const auto& aitr) { return (aitr == domainv); }); + }; + + for(const auto& itr : _domains) + { + if(invalid_domain(itr)) + { + ROCPROFSYS_THROW("unsupported ROCPROFSYS_ROCM_DOMAINS value: %s\n", + itr.c_str()); + } + + if(itr == "hsa_api") + { + for(auto eitr : { ROCPROFILER_BUFFER_TRACING_HSA_CORE_API, + ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API, + ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API, + ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API }) + 
_data.emplace(eitr); + } + else if(itr == "hip_api") + { + for(auto eitr : { ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API, + ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API }) + _data.emplace(eitr); + } + else if(itr == "marker_api" || itr == "roctx") + { + _data.emplace(ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API); + } + else + { + for(size_t idx = 0; idx < buffer_tracing_info.size(); ++idx) + { + auto ditr = buffer_tracing_info[idx]; + auto dval = static_cast(idx); + if(itr == to_lower(ditr.name) && supported.count(dval) > 0) + { + _data.emplace(dval); + break; + } + } + } + } + + return _data; +} + +std::vector +get_rocm_events() +{ + return tim::delimit( + get_setting_value("ROCPROFSYS_ROCM_EVENTS").value_or(std::string{}), + " ,;\t\n"); +} + +std::vector +get_operations(rocprofiler_callback_tracing_kind_t kindv) +{ + ROCPROFSYS_CONDITIONAL_ABORT_F( + callback_operation_option_names.count(kindv) == 0, + "callback_operation_operation_names does not have value for %i\n", kindv); + + auto _complete = get_operations_impl(kindv); + auto _include = get_operations_impl( + kindv, callback_operation_option_names.at(kindv).operations_include); + auto _exclude = get_operations_impl( + kindv, callback_operation_option_names.at(kindv).operations_exclude); + + return get_operations_impl(_complete, _include, _exclude); +} + +std::vector +get_operations(rocprofiler_buffer_tracing_kind_t kindv) +{ + ROCPROFSYS_CONDITIONAL_ABORT_F( + buffered_operation_option_names.count(kindv) == 0, + "buffered_operation_option_names does not have value for %i\n", kindv); + + auto _complete = get_operations_impl(kindv); + auto _include = get_operations_impl( + kindv, buffered_operation_option_names.at(kindv).operations_include); + auto _exclude = get_operations_impl( + kindv, buffered_operation_option_names.at(kindv).operations_exclude); + + return get_operations_impl(_complete, _include, _exclude); +} + +std::unordered_set +get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv) +{ + 
ROCPROFSYS_CONDITIONAL_ABORT_F( + callback_operation_option_names.count(kindv) == 0, + "callback_operation_operation_names does not have value for %i\n", kindv); + + auto _data = get_operations_impl( + kindv, callback_operation_option_names.at(kindv).operations_annotate_backtrace); + auto _ret = std::unordered_set{}; + _ret.reserve(_data.size()); + for(auto itr : _data) + _ret.emplace(itr); + return _ret; +} + +std::unordered_set +get_backtrace_operations(rocprofiler_buffer_tracing_kind_t kindv) +{ + ROCPROFSYS_CONDITIONAL_ABORT_F( + buffered_operation_option_names.count(kindv) == 0, + "buffered_operation_option_names does not have value for %i\n", kindv); + + auto _data = get_operations_impl( + kindv, buffered_operation_option_names.at(kindv).operations_annotate_backtrace); + auto _ret = std::unordered_set{}; + _ret.reserve(_data.size()); + for(auto itr : _data) + _ret.emplace(itr); + return _ret; +} +} // namespace rocprofiler_sdk +} // namespace rocprofsys + +#else + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +void +config_settings(const std::shared_ptr&) +{} +} // namespace rocprofiler_sdk +} // namespace rocprofsys + +#endif diff --git a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp new file mode 100644 index 0000000000..5ceee14e2a --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp @@ -0,0 +1,70 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "core/timemory.hpp" + +#if defined(ROCPROFSYS_USE_ROCM) +# include +# include +#endif + +#include +#include +#include +#include +#include + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +void +config_settings(const std::shared_ptr&); + +#if defined(ROCPROFSYS_USE_ROCM) + +std::unordered_set +get_callback_domains(); + +std::unordered_set +get_buffered_domains(); + +std::vector +get_operations(rocprofiler_callback_tracing_kind_t kindv); + +std::vector +get_operations(rocprofiler_buffer_tracing_kind_t kindv); + +std::vector +get_rocm_events(); + +std::unordered_set +get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv); + +std::unordered_set +get_backtrace_operations(rocprofiler_buffer_tracing_kind_t kindv); + +#endif +} // namespace rocprofiler_sdk +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/core/state.cpp b/projects/rocprofiler-systems/source/lib/core/state.cpp index 7434b7de18..fd8c4d4403 100644 --- a/projects/rocprofiler-systems/source/lib/core/state.cpp +++ b/projects/rocprofiler-systems/source/lib/core/state.cpp @@ -21,6 +21,7 @@ // SOFTWARE. 
#include "state.hpp" +#include "common/static_object.hpp" #include "config.hpp" #include "debug.hpp" #include "utility.hpp" @@ -35,8 +36,9 @@ namespace auto& get_state_value() { - static auto _v = std::atomic{ State::PreInit }; - return _v; + static auto*& _v = common::static_object>::construct( + common::do_not_destroy{}, State::PreInit); + return *_v; } ThreadState& diff --git a/projects/rocprofiler-systems/source/lib/core/utility.hpp b/projects/rocprofiler-systems/source/lib/core/utility.hpp index ac0f71fe6d..8ac877edc0 100644 --- a/projects/rocprofiler-systems/source/lib/core/utility.hpp +++ b/projects/rocprofiler-systems/source/lib/core/utility.hpp @@ -74,6 +74,15 @@ get_reserved_vector(size_t _n) return _v; } +/// returns a vector with a preallocated buffer +template +inline decltype(auto) +get_reserved_vector(std::vector&& _v, size_t _n) +{ + _v.reserve(_n); + return std::forward>(_v); +} + template struct offset_index_sequence; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/CMakeLists.txt index 5a36c71128..565b50e6c9 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/CMakeLists.txt @@ -25,7 +25,8 @@ target_include_directories( PUBLIC $ $ $ - $) + $ + PRIVATE ${rocprofiler-sdk_INCLUDE_DIR}) target_link_libraries( rocprofiler-systems-dl-library PUBLIC $ diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl.cpp index 77d59f0e73..3d2f18e016 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl.cpp @@ -54,6 +54,14 @@ #include #include +#if !defined(ROCPROFSYS_USE_ROCM) +# define ROCPROFSYS_USE_ROCM 0 +#endif + +#if ROCPROFSYS_USE_ROCM > 0 +# include +#endif + 
//--------------------------------------------------------------------------------------// #define ROCPROFSYS_DLSYM(VARNAME, HANDLE, FUNCNAME) \ @@ -79,6 +87,7 @@ //--------------------------------------------------------------------------------------// using main_func_t = int (*)(int, char**, char**); +using init_func_t = void (*)(void); std::ostream& operator<<(std::ostream& _os, const SpaceHandle& _handle) @@ -360,14 +369,8 @@ struct ROCPROFSYS_INTERNAL_API indirect ROCPROFSYS_DLSYM(kokkosp_dual_view_modify_f, m_omnihandle, "kokkosp_dual_view_modify"); -#if ROCPROFSYS_USE_ROCTRACER > 0 - ROCPROFSYS_DLSYM(hsa_on_load_f, m_omnihandle, "OnLoad"); - ROCPROFSYS_DLSYM(hsa_on_unload_f, m_omnihandle, "OnUnload"); -#endif - -#if ROCPROFSYS_USE_ROCPROFILER > 0 - ROCPROFSYS_DLSYM(rocp_on_load_tool_prop_f, m_omnihandle, "OnLoadToolProp"); - ROCPROFSYS_DLSYM(rocp_on_unload_tool_f, m_omnihandle, "OnUnloadTool"); +#if ROCPROFSYS_USE_ROCM > 0 + ROCPROFSYS_DLSYM(rocprofiler_configure_f, m_omnihandle, "rocprofiler_configure"); #endif #if ROCPROFSYS_USE_OMPT == 0 @@ -460,16 +463,9 @@ public: void (*kokkosp_dual_view_sync_f)(const char*, const void* const, bool) = nullptr; void (*kokkosp_dual_view_modify_f)(const char*, const void* const, bool) = nullptr; - // HSA functions -#if ROCPROFSYS_USE_ROCTRACER > 0 - bool (*hsa_on_load_f)(HsaApiTable*, uint64_t, uint64_t, const char* const*) = nullptr; - void (*hsa_on_unload_f)() = nullptr; -#endif - - // ROCP functions -#if ROCPROFSYS_USE_ROCPROFILER > 0 - void (*rocp_on_load_tool_prop_f)(void* settings) = nullptr; - void (*rocp_on_unload_tool_f)() = nullptr; +#if ROCPROFSYS_USE_ROCM > 0 + rocprofiler_tool_configure_result_t* (*rocprofiler_configure_f)( + uint32_t, const char*, uint32_t, rocprofiler_client_id_t*) = nullptr; #endif // OpenMP functions @@ -644,13 +640,18 @@ extern "C" bool _invoked = false; ROCPROFSYS_DL_INVOKE_STATUS(_invoked, get_indirect().rocprofsys_init_f, a, b, c); + if(_invoked) { dl::get_active() = true; 
dl::get_inited() = true; dl::_rocprofsys_dl_verbose = dl::get_rocprofsys_dl_env(); - if(dl::get_instrumented() < dl::InstrumentMode::PythonProfile) + + if(dl::get_instrumented() >= dl::InstrumentMode::None && + dl::get_instrumented() < dl::InstrumentMode::PythonProfile) + { dl::rocprofsys_postinit((c) ? std::string{ c } : std::string{}); + } } } @@ -1069,43 +1070,17 @@ extern "C" //----------------------------------------------------------------------------------// // - // HSA + // ROCm // //----------------------------------------------------------------------------------// -#if ROCPROFSYS_USE_ROCTRACER > 0 - bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, - const char* const* failed_tool_names) +#if ROCPROFSYS_USE_ROCM > 0 + rocprofiler_tool_configure_result_t* rocprofiler_configure( + uint32_t version, const char* runtime_version, uint32_t priority, + rocprofiler_client_id_t* client_id) { - return ROCPROFSYS_DL_INVOKE(get_indirect().hsa_on_load_f, table, runtime_version, - failed_tool_count, failed_tool_names); - } - - void OnUnload() { return ROCPROFSYS_DL_INVOKE(get_indirect().hsa_on_unload_f); } -#endif - - //----------------------------------------------------------------------------------// - // - // ROCP - // - //----------------------------------------------------------------------------------// - -#if ROCPROFSYS_USE_ROCPROFILER > 0 - void OnLoadToolProp(void* settings) - { - ROCPROFSYS_DL_LOG( - -16, - "invoking %s(rocprofiler_settings_t*) within librocprof-sys-dl.so " - "will cause a silent failure for rocprofiler. 
ROCP_TOOL_LIB " - "should be set to librocprof-sys.so\n", - __FUNCTION__); - abort(); - return ROCPROFSYS_DL_INVOKE(get_indirect().rocp_on_load_tool_prop_f, settings); - } - - void OnUnloadTool() - { - return ROCPROFSYS_DL_INVOKE(get_indirect().rocp_on_unload_tool_f); + return ROCPROFSYS_DL_INVOKE(get_indirect().rocprofiler_configure_f, version, + runtime_version, priority, client_id); } #endif @@ -1227,7 +1202,9 @@ rocprofsys_preinit() void rocprofsys_postinit(std::string _exe) { - switch(get_instrumented()) + InstrumentMode instrumentMode = get_instrumented(); + + switch(instrumentMode) { case InstrumentMode::None: case InstrumentMode::BinaryRewrite: @@ -1393,20 +1370,122 @@ verify_instrumented_preloaded() bool _handle_preload = rocprofsys_preload(); main_func_t main_real = nullptr; +init_func_t init_real = nullptr; } // namespace } // namespace dl } // namespace rocprofsys extern "C" { + void rocprofsys_main_init(void) ROCPROFSYS_INTERNAL_API; int rocprofsys_main(int argc, char** argv, char** envp) ROCPROFSYS_INTERNAL_API; + + void rocprofsys_set_main_init(init_func_t) ROCPROFSYS_INTERNAL_API; void rocprofsys_set_main(main_func_t) ROCPROFSYS_INTERNAL_API; + void rocprofsys_set_main_init(init_func_t _init_real) + { + ::rocprofsys::dl::init_real = _init_real; + } + void rocprofsys_set_main(main_func_t _main_real) { ::rocprofsys::dl::main_real = _main_real; } + // void rocprofsys_main_init(int argc, char** argv, char** envp) + // { + // ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__); + // using ::rocprofsys::common::get_env; + // using ::rocprofsys::dl::get_default_mode; + + // // prevent re-entry + // static int _reentry = 0; + // if(_reentry > 0) return -1; + // _reentry = 1; + + // int ret = 0; + + // if(::rocprofsys::dl::init_real) + // { + // if(envp) + // { + // size_t _idx = 0; + // while(envp[_idx] != nullptr) + // { + // auto _env_v = std::string_view{ envp[_idx++] }; + // if(_env_v.find("ROCPROFSYS") != 0 && + // _env_v.find("librocprof-sys") == 
std::string_view::npos) + // continue; + // auto _pos = _env_v.find('='); + // if(_pos < _env_v.length()) + // { + // auto _var = std::string{ _env_v }.substr(0, _pos); + // auto _val = std::string{ _env_v }.substr(_pos + 1); + // ROCPROFSYS_DL_LOG(1, "%s(%s, %s)\n", "rocprofsys_set_env", + // _var.c_str(), _val.c_str()); + // setenv(_var.c_str(), _val.c_str(), 0); + // } + // } + // } + + // ret = (*::rocprofsys::dl::init_real)(argc, argv, envp); + // } + // else + // { + // ROCPROFSYS_DL_LOG( + // 0, "%s\n", + // "Unsuccessful wrapping of init: nullptr to real init function"); + // } + + // auto _mode = get_env("ROCPROFSYS_MODE", get_default_mode()); + // rocprofsys_init(_mode.c_str(), + // dl::get_instrumented() == dl::InstrumentMode::BinaryRewrite, + // argv[0]); + + // return ret; + // } + + // int rocprofsys_main(int argc, char** argv, char** envp) + // { + // ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__); + + // // prevent re-entry + // static int _reentry = 0; + // if(_reentry > 0) return -1; + // _reentry = 1; + + // if(!::rocprofsys::dl::main_real) + // throw std::runtime_error("[rocprof-sys][dl] Unsuccessful wrapping of main: + // " + // "nullptr to real main function"); + + // rocprofsys_push_trace(basename(argv[0])); + + // int ret = (*::rocprofsys::dl::main_real)(argc, argv, envp); + + // rocprofsys_pop_trace(basename(argv[0])); + // rocprofsys_finalize(); + + // return ret; + // } + + void rocprofsys_main_init(void) + { + ROCPROFSYS_DL_LOG(0, "[%s].\n", __FUNCTION__); + + if(::rocprofsys::dl::init_real) + { + // Call real init function + (*::rocprofsys::dl::init_real)(); + } + else + { + ROCPROFSYS_DL_LOG( + 0, "Unsuccessful wrapping of init: real_init function is nullptr.\n"); + } + } + int rocprofsys_main(int argc, char** argv, char** envp) { ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__); @@ -1420,7 +1499,7 @@ extern "C" if(!::rocprofsys::dl::main_real) throw std::runtime_error("[rocprof-sys][dl] Unsuccessful wrapping of main: " - "nullptr to real main 
function"); + "real_main function is nullptr."); if(envp) { @@ -1455,4 +1534,4 @@ extern "C" return ret; } -} +} // extern "C" diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl/dl.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl/dl.hpp index 1a34612b13..cfd269c7fd 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl/dl.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/dl/dl.hpp @@ -53,12 +53,8 @@ # define ROCPROFSYS_USE_OMPT 0 #endif -#if !defined(ROCPROFSYS_USE_ROCTRACER) -# define ROCPROFSYS_USE_ROCTRACER 0 -#endif - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) -# define ROCPROFSYS_USE_ROCPROFILER 0 +#if !defined(ROCPROFSYS_USE_ROCM) +# define ROCPROFSYS_USE_ROCM 0 #endif //--------------------------------------------------------------------------------------// @@ -177,20 +173,12 @@ extern "C" const char*) ROCPROFSYS_PUBLIC_API; # endif -# if ROCPROFSYS_USE_ROCTRACER > 0 - // HSA - struct HsaApiTable; - bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, - const char* const* failed_tool_names) ROCPROFSYS_PUBLIC_API; - void OnUnload() ROCPROFSYS_PUBLIC_API; +# if ROCPROFSYS_USE_ROCM > 0 + struct rocprofiler_tool_configure_result_t; + struct rocprofiler_client_id_t; # endif -# if ROCPROFSYS_USE_ROCPROFILER > 0 - // ROCP - void OnLoadToolProp(void* settings) ROCPROFSYS_PUBLIC_API; - void OnUnloadTool() ROCPROFSYS_PUBLIC_API; -# endif -#endif +#endif // ROCPROFSYS_DL_SOURCE } namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/main.c b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/main.c index 7391a4e7a0..a654efaade 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/main.c +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys-dl/main.c @@ -37,20 +37,19 @@ // local type definitions // typedef int (*main_func_t)(int, char**, char**); -typedef int (*start_main_t)(int (*)(int, char**, char**), int, char**, 
- int (*)(int, char**, char**), void (*)(void), void (*)(void), - void*); +typedef void (*init_func_t)(void); +typedef int (*start_main_t)(int (*)(int, char**, char**), int, char**, void (*)(void), + void (*)(void), void (*)(void), void*); // // local function declarations // int -rocprofsys_libc_start_main(int (*)(int, char**, char**), int, char**, - int (*)(int, char**, char**), void (*)(void), void (*)(void), - void*) ROCPROFSYS_INTERNAL_API; +rocprofsys_libc_start_main(int (*)(int, char**, char**), int, char**, void (*)(void), + void (*)(void), void (*)(void), void*) ROCPROFSYS_INTERNAL_API; int -__libc_start_main(int (*)(int, char**, char**), int, char**, int (*)(int, char**, char**), +__libc_start_main(int (*)(int, char**, char**), int, char**, void (*)(void), void (*)(void), void (*)(void), void*) ROCPROFSYS_PUBLIC_API; // @@ -79,12 +78,18 @@ basename(const char*); extern void rocprofsys_set_main(main_func_t) ROCPROFSYS_INTERNAL_API; +extern void +rocprofsys_set_main_init(init_func_t func) ROCPROFSYS_INTERNAL_API; + +extern void +rocprofsys_main_init(void) ROCPROFSYS_INTERNAL_API; + extern int rocprofsys_main(int argc, char** argv, char** envp) ROCPROFSYS_INTERNAL_API; int rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv, - int (*_init)(int, char**, char**), void (*_fini)(void), + void (*_init)(void), void (*_fini)(void), void (*_rtld_fini)(void), void* _stack_end) { int _preload = rocprofsys_preload_library(); @@ -97,8 +102,9 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char** // get the address of this function void* _this_func = __builtin_return_address(0); - // Save the real main function address + // Save the real main function addresses rocprofsys_set_main(_main); + rocprofsys_set_main_init(_init); // Find the real __libc_start_main() start_main_t user_main = dlsym(RTLD_NEXT, "__libc_start_main"); @@ -115,6 +121,10 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int 
_argc, char** } else { + // return user_main(rocprofsys_main, _argc, _argv, + // rocprofsys_main_init, _fini, + // _rtld_fini, _stack_end); + // call rocprof-sys main function wrapper return user_main(rocprofsys_main, _argc, _argv, _init, _fini, _rtld_fini, _stack_end); @@ -129,9 +139,10 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char** int __libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv, - int (*_init)(int, char**, char**), void (*_fini)(void), - void (*_rtld_fini)(void), void* _stack_end) + void (*_init)(void), void (*_fini)(void), void (*_rtld_fini)(void), + void* _stack_end) { + // intercept the main function return rocprofsys_libc_start_main(_main, _argc, _argv, _init, _fini, _rtld_fini, _stack_end); } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h b/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h index d559799ba0..fbc17bc7cf 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h @@ -43,19 +43,21 @@ extern "C" ROCPROFSYS_CATEGORY_PYTHON, ROCPROFSYS_CATEGORY_USER, ROCPROFSYS_CATEGORY_HOST, - ROCPROFSYS_CATEGORY_DEVICE_HIP, - ROCPROFSYS_CATEGORY_DEVICE_HSA, - ROCPROFSYS_CATEGORY_ROCM_HIP, - ROCPROFSYS_CATEGORY_ROCM_HSA, - ROCPROFSYS_CATEGORY_ROCM_ROCTX, + ROCPROFSYS_CATEGORY_ROCM, + ROCPROFSYS_CATEGORY_ROCM_HIP_API, + ROCPROFSYS_CATEGORY_ROCM_HSA_API, + ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH, + ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY, + ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY, + ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION, + ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION, + ROCPROFSYS_CATEGORY_ROCM_MARKER_API, ROCPROFSYS_CATEGORY_ROCM_SMI, ROCPROFSYS_CATEGORY_ROCM_SMI_BUSY, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, 
ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, ROCPROFSYS_CATEGORY_ROCM_RCCL, - ROCPROFSYS_CATEGORY_ROCTRACER, - ROCPROFSYS_CATEGORY_ROCPROFILER, ROCPROFSYS_CATEGORY_SAMPLING, ROCPROFSYS_CATEGORY_PTHREAD, ROCPROFSYS_CATEGORY_KOKKOS, diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp index e5d1017144..92ee2b4874 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp @@ -26,6 +26,7 @@ // #include "api.hpp" #include "common/setup.hpp" +#include "common/static_object.hpp" #include "core/categories.hpp" #include "core/components/fwd.hpp" #include "core/concepts.hpp" @@ -46,13 +47,12 @@ #include "library/components/mpi_gotcha.hpp" #include "library/components/numa_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" -#include "library/components/rocprofiler.hpp" #include "library/coverage.hpp" #include "library/ompt.hpp" #include "library/process_sampler.hpp" #include "library/ptl.hpp" #include "library/rcclp.hpp" -#include "library/rocprofiler.hpp" +#include "library/rocprofiler-sdk.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" @@ -399,10 +399,6 @@ rocprofsys_init_library_hidden() if(_debug_init) config::set_setting_value("ROCPROFSYS_DEBUG", _debug_value); } }; - tim::trait::runtime_enabled::set(get_use_roctracer()); - tim::trait::runtime_enabled::set(get_use_roctracer() && - get_use_timemory()); - ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n"); } @@ -718,13 +714,6 @@ rocprofsys_finalize_hidden(void) } } - if(get_use_roctracer()) - { - ROCPROFSYS_VERBOSE_F(1, "Flushing roctracer...\n"); - // ensure that roctracer is flushed before setting the state to finalized - comp::roctracer::flush(); - } - set_state(State::Finalized); push_enable_sampling_on_child_threads(false); @@ -785,6 +774,14 @@ rocprofsys_finalize_hidden(void) 
ompt::shutdown(); } +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 + // TODO: option for rocm + { + ROCPROFSYS_VERBOSE_F(1, "Shutting down ROCm...\n"); + rocprofiler_sdk::shutdown(); + } +#endif + ROCPROFSYS_DEBUG_F("Stopping and destroying instrumentation bundles...\n"); for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i) { @@ -835,24 +832,6 @@ rocprofsys_finalize_hidden(void) process_sampler::shutdown(); } - if(get_use_roctracer()) - { - ROCPROFSYS_VERBOSE_F(1, "Shutting down roctracer...\n"); - // ensure that threads running roctracer callbacks shutdown - comp::roctracer::shutdown(); - - // join extra thread(s) used by roctracer - ROCPROFSYS_VERBOSE_F(2, "Waiting on roctracer tasks...\n"); - tasking::join(); - } - - if(get_use_rocprofiler()) - { - ROCPROFSYS_VERBOSE_F(1, "Shutting down rocprofiler...\n"); - rocprofiler::post_process(); - rocprofiler::rocm_cleanup(); - } - if(get_use_causal()) { ROCPROFSYS_VERBOSE_F(1, "Shutting down causal sampling...\n"); @@ -919,7 +898,7 @@ rocprofsys_finalize_hidden(void) process_sampler::post_process(); } - // shutdown tasking before timemory is finalized, especially the roctracer thread-pool + // shutdown tasking before timemory is finalized ROCPROFSYS_VERBOSE_F(1, "Shutting down thread-pools...\n"); tasking::shutdown(); @@ -991,6 +970,8 @@ rocprofsys_finalize_hidden(void) tim::signals::enable_signal_detection( { tim::signals::sys_signal::SegFault, tim::signals::sys_signal::Stop }, [](int) {}); + + common::destroy_static_objects(); } //======================================================================================// diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt index 776b06b5e2..5084c43958 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt @@ -23,8 +23,7 @@ 
set(library_headers ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/rocm.hpp ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.hpp - ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp - ${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp + ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp ${CMAKE_CURRENT_LIST_DIR}/runtime.hpp ${CMAKE_CURRENT_LIST_DIR}/sampling.hpp ${CMAKE_CURRENT_LIST_DIR}/thread_data.hpp @@ -35,37 +34,23 @@ set(library_headers target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources} ${library_headers}) -if(ROCPROFSYS_USE_ROCTRACER OR ROCPROFSYS_USE_ROCPROFILER) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm.cpp) -endif() - -if(ROCPROFSYS_USE_ROCTRACER) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp) -endif() - if(ROCPROFSYS_USE_RCCL) target_sources(rocprofiler-systems-object-library PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) endif() -if(ROCPROFSYS_USE_ROCPROFILER) +if(ROCPROFSYS_USE_ROCM) target_sources( rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp - ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp) -endif() - -if(ROCPROFSYS_USE_ROCM_SMI) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp) + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm.cpp + ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp + ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp) + add_subdirectory(rocprofiler-sdk) endif() add_subdirectory(causal) add_subdirectory(components) add_subdirectory(coverage) -add_subdirectory(rocm) add_subdirectory(tracing) set(ndebug_sources diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt index 16f841a8a7..4c66da1551 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt +++ 
b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt @@ -28,8 +28,6 @@ set(component_headers ${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/numa_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp - ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp - ${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp) @@ -37,16 +35,6 @@ set(component_headers target_sources(rocprofiler-systems-object-library PRIVATE ${component_sources} ${component_headers}) -if(ROCPROFSYS_USE_ROCPROFILER) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp) -endif() - -if(ROCPROFSYS_USE_ROCTRACER) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp) -endif() - if(ROCPROFSYS_USE_RCCL) target_sources(rocprofiler-systems-object-library PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp index ddb571048a..77d22faef8 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp @@ -64,13 +64,13 @@ using tim::type_list; // these categories increment push/pop counts, which are used for sanity checks since // they should ALWAYS be popped if they were pushed using tracing_count_categories_t = - type_list; + type_list; // convert these categories to throughput points using causal_throughput_categories_t = - type_list; + type_list; // define this outside of category region functions so that the // static thread_local is global instead of per-template instantiation diff --git 
a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp index 39a370f405..fcf354721e 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp @@ -28,7 +28,6 @@ #include "core/utility.hpp" #include "library/causal/delay.hpp" #include "library/components/category_region.hpp" -#include "library/components/roctracer.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" @@ -61,7 +60,7 @@ shutdown(); namespace component { -using bundle_t = tim::lightweight_tuple; +using bundle_t = tim::lightweight_tuple; using category_region_t = tim::lightweight_tuple>; namespace @@ -82,7 +81,6 @@ inline void start_bundle(bundle_t& _bundle, int64_t _tid, Args&&... _args) { if(!get_use_timemory() && !get_use_perfetto()) return; - trait::runtime_enabled::set(get_use_roctracer()); ROCPROFSYS_BASIC_VERBOSE_F(3, "starting bundle '%s' in thread %li...\n", _bundle.key().c_str(), _tid); if constexpr(sizeof...(Args) > 0) @@ -619,5 +617,3 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, } } // namespace component } // namespace rocprofsys - -TIMEMORY_INITIALIZE_STORAGE(component::roctracer_data) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.cpp deleted file mode 100644 index 0253436adc..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.cpp +++ /dev/null @@ -1,193 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "library/components/rocprofiler.hpp" -#include "core/common.hpp" -#include "core/config.hpp" -#include "core/debug.hpp" -#include "core/defines.hpp" -#include "core/dynamic_library.hpp" -#include "core/perfetto.hpp" -#include "core/redirect.hpp" -#include "library/rocprofiler.hpp" -#include "library/sampling.hpp" -#include "library/thread_data.hpp" - -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace rocprofsys -{ -namespace component -{ -namespace -{ -auto& -rocprofiler_activity_count() -{ - static std::atomic _v{ 0 }; - return _v; -} -} // namespace - -unique_ptr_t& -rocm_data(int64_t _tid) -{ - using thread_data_t = thread_data; - return thread_data_t::instance(construct_on_thread{ _tid }); -} - -rocm_event::rocm_event(uint32_t _dev, uint32_t _thr, uint32_t _queue, - std::string _event_name, rocm_metric_type _begin, - rocm_metric_type _end, uint32_t _feature_count, void* _features_v) -: device_id{ _dev } -, thread_id{ _thr } -, queue_id{ _queue } -, entry{ _begin } -, exit{ _end } -, name(std::move(_event_name)) -{ - feature_values.reserve(_feature_count); - feature_names.reserve(_feature_count); - auto* _features = static_cast(_features_v); - for(uint32_t i = 0; i < _feature_count; ++i) - { - const rocprofiler_feature_t* p = &_features[i]; - feature_names.emplace_back(i); - switch(p->data.kind) - { - // Output metrics results - case ROCPROFILER_DATA_KIND_UNINIT: break; - case ROCPROFILER_DATA_KIND_BYTES: - feature_values.emplace_back( - rocm_feature_value{ p->data.result_bytes.size }); - break; - case ROCPROFILER_DATA_KIND_INT32: - feature_values.emplace_back(rocm_feature_value{ p->data.result_int32 }); - break; - case ROCPROFILER_DATA_KIND_FLOAT: - feature_values.emplace_back(rocm_feature_value{ p->data.result_float }); - break; - case ROCPROFILER_DATA_KIND_DOUBLE: - feature_values.emplace_back(rocm_feature_value{ p->data.result_double }); - break; - case ROCPROFILER_DATA_KIND_INT64: - 
feature_values.emplace_back(rocm_feature_value{ p->data.result_int64 }); - break; - } - } -} - -std::string -rocm_event::as_string() const -{ - std::stringstream _ss{}; - _ss << name << ", device: " << device_id << ", queue: " << queue_id - << ", thread: " << thread_id << ", entry: " << entry << ", exit = " << exit; - _ss.precision(3); - _ss << std::fixed; - for(size_t i = 0; i < feature_names.size(); ++i) - { - auto _name = rocprofsys::rocprofiler::get_data_labels().at(device_id).at( - feature_names.at(i)); - _ss << ", " << _name << " = "; - auto _as_string = [&_ss](auto&& itr) { _ss << std::setw(4) << itr; }; - std::visit(_as_string, feature_values.at(i)); - } - return _ss.str(); -} - -void -rocprofiler::preinit() -{ - rocprofiler_data::label() = "rocprofiler"; - rocprofiler_data::description() = "ROCm hardware counters"; -} - -void -rocprofiler::start() -{ - if(tracker_type::start() == 0) setup(); -} - -void -rocprofiler::stop() -{ - if(tracker_type::stop() == 0) shutdown(); -} - -bool -rocprofiler::is_setup() -{ - return rocprofsys::rocprofiler::is_setup(); -} - -void -rocprofiler::add_setup(const std::string&, std::function&&) -{} - -void -rocprofiler::add_shutdown(const std::string&, std::function&&) -{} - -void -rocprofiler::remove_setup(const std::string&) -{} - -void -rocprofiler::remove_shutdown(const std::string&) -{} - -void -rocprofiler::setup() -{ - ROCPROFSYS_VERBOSE_F(1, "rocprofiler is setup\n"); -} - -void -rocprofiler::shutdown() -{ - rocprofsys::rocprofiler::post_process(); - rocprofsys::rocprofiler::rocm_cleanup(); - ROCPROFSYS_VERBOSE_F(1, "rocprofiler is shutdown\n"); -} - -scope::transient_destructor -rocprofiler::protect_flush_activity() -{ - return scope::transient_destructor([]() { --rocprofiler_activity_count(); }, - []() { ++rocprofiler_activity_count(); }); -} -} // namespace component -} // namespace rocprofsys - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(rocprofiler, false, void) 
-ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(rocprofiler_data, true, - tim::component::rocprofiler_value) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.hpp deleted file mode 100644 index 27326e9dda..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rocprofiler.hpp +++ /dev/null @@ -1,241 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#pragma once - -#include "core/components/fwd.hpp" -#include "core/defines.hpp" -#include "library/thread_data.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace rocprofsys -{ -namespace component -{ -using rocm_metric_type = unsigned long long; -using rocm_info_entry = ::tim::hardware_counters::info; -using rocm_feature_value = std::variant; - -struct rocm_counter -{ - std::array counters; -}; - -struct rocm_event -{ - using value_type = rocm_feature_value; - - uint32_t device_id = 0; - uint32_t thread_id = 0; - uint32_t queue_id = 0; - rocm_metric_type entry = 0; - rocm_metric_type exit = 0; - std::string name = {}; - std::vector feature_names = {}; - std::vector feature_values = {}; - - rocm_event() = default; - rocm_event(uint32_t _dev, uint32_t _thr, uint32_t _queue, std::string _event_name, - rocm_metric_type begin, rocm_metric_type end, uint32_t _feature_count, - void* _features); - - std::string as_string() const; - - friend std::ostream& operator<<(std::ostream& _os, const rocm_event& _v) - { - return (_os << _v.as_string()); - } - - friend bool operator<(const rocm_event& _lhs, const rocm_event& _rhs) - { - return std::tie(_lhs.device_id, _lhs.queue_id, _lhs.entry, _lhs.thread_id) < - std::tie(_rhs.device_id, _rhs.queue_id, _rhs.entry, _rhs.thread_id); - } -}; - -using rocm_data_t = std::vector; -using rocm_data_tracker = data_tracker; - -rocprofsys::unique_ptr_t& -rocm_data(int64_t _tid = threading::get_id()); - -using rocprofiler_value = typename rocm_event::value_type; -using rocprofiler_data = data_tracker; - -struct rocprofiler -: base -, private policy::instance_tracker -{ - using value_type = void; - using base_type = base; - using tracker_type = policy::instance_tracker; - - ROCPROFSYS_DEFAULT_OBJECT(rocprofiler) - - static void preinit(); - static void global_init() { setup(); } - 
static void global_finalize() { shutdown(); } - - static bool is_setup(); - static void setup(); - static void shutdown(); - static void add_setup(const std::string&, std::function&&); - static void add_shutdown(const std::string&, std::function&&); - static void remove_setup(const std::string&); - static void remove_shutdown(const std::string&); - - void start(); - void stop(); - - // this function protects rocprofiler_flush_activty from being called - // when rocprof-sys exits during a callback - [[nodiscard]] static scope::transient_destructor protect_flush_activity(); -}; - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) -inline void -rocprofiler::setup() -{} - -inline void -rocprofiler::shutdown() -{} - -inline bool -rocprofiler::is_setup() -{ - return false; -} -#endif -} // namespace component -} // namespace rocprofsys - -namespace tim -{ -namespace component -{ -using ::rocprofsys::component::rocm_data_tracker; -using ::rocprofsys::component::rocm_feature_value; -using ::rocprofsys::component::rocprofiler_data; -using ::rocprofsys::component::rocprofiler_value; -} // namespace component -} // namespace tim - -namespace tim -{ -namespace operation -{ -template <> -struct set_storage -{ - using T = component::rocm_data_tracker; - static constexpr size_t max_threads = 4096; - using type = T; - using storage_array_t = std::array*, max_threads>; - friend struct get_storage; - - ROCPROFSYS_DEFAULT_OBJECT(set_storage) - - auto operator()(storage*, size_t) const {} - auto operator()(type&, size_t) const {} - auto operator()(storage* _v) const { get().fill(_v); } - -private: - static storage_array_t& get() - { - static storage_array_t _v = { nullptr }; - return _v; - } -}; - -template <> -struct get_storage -{ - using type = component::rocm_data_tracker; - - ROCPROFSYS_DEFAULT_OBJECT(get_storage) - - auto operator()(const type&) const - { - return operation::set_storage::get().at(0); - } - - auto operator()() const - { - type _obj{}; - return (*this)(_obj); - } - - auto 
operator()(size_t _idx) const - { - return operation::set_storage::get().at(_idx); - } - - auto operator()(type&, size_t _idx) const { return (*this)(_idx); } -}; -} // namespace operation -} // namespace tim - -#if !defined(ROCPROFSYS_USE_ROCPROFILER) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler_data, false_type) -#endif - -TIMEMORY_SET_COMPONENT_API(component::rocprofiler_data, project::timemory, - category::timing, os::supports_unix) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_timing_category, component::rocprofiler_data, - false_type) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::rocprofiler_data, - false_type) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_units, component::rocprofiler_data, false_type) -TIMEMORY_STATISTICS_TYPE(component::rocprofiler_data, component::rocprofiler_value) -TIMEMORY_STATISTICS_TYPE(component::rocm_data_tracker, component::rocm_feature_value) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_units, component::rocm_data_tracker, false_type) - -#if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \ - (defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0) - -# include - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT(rocprofiler, false, void) -ROCPROFSYS_DECLARE_EXTERN_COMPONENT(rocprofiler_data, true, double) - -#endif diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.cpp deleted file mode 100644 index 29ba18385f..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.cpp +++ /dev/null @@ -1,396 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "library/components/roctracer.hpp" -#include "core/common.hpp" -#include "core/config.hpp" -#include "core/debug.hpp" -#include "core/defines.hpp" -#include "core/dynamic_library.hpp" -#include "core/redirect.hpp" -#include "library/roctracer.hpp" -#include "library/runtime.hpp" -#include "library/thread_data.hpp" -#include "library/thread_info.hpp" - -#include -#include - -#define HIP_PROF_HIP_API_STRING 1 - -#include -#include - -#if ROCPROFSYS_HIP_VERSION < 50300 -# include -#endif - -#define AMD_INTERNAL_BUILD 1 -#include - -namespace rocprofsys -{ -namespace component -{ -namespace -{ -auto& -roctracer_activity_count() -{ - static std::atomic _v{ 0 }; - return _v; -} -} // namespace - -void -roctracer::preinit() -{ - roctracer_data::label() = "roctracer"; - roctracer_data::description() = "ROCm tracer (activity API)"; -} - -void -roctracer::start() -{ - if(tracker_type::start() == 0) setup(nullptr); -} - -void -roctracer::stop() -{ - if(tracker_type::stop() == 0) shutdown(); -} - -bool -roctracer::is_setup() -{ - return roctracer_is_setup(); -} - -void -roctracer::add_setup(const std::string& _lbl, std::function&& _func) -{ - roctracer_setup_routines().emplace_back(_lbl, std::move(_func)); -} - -void -roctracer::add_shutdown(const std::string& _lbl, std::function&& _func) -{ - roctracer_shutdown_routines().emplace_back(_lbl, std::move(_func)); -} - -void -roctracer::remove_setup(const std::string& _lbl) -{ - auto& _data = roctracer_setup_routines(); - for(auto itr = _data.begin(); itr != _data.end(); ++itr) - { - if(itr->first == _lbl) - { - _data.erase(itr); - break; - } - } -} - -void -roctracer::remove_shutdown(const std::string& _lbl) -{ - auto& _data = roctracer_setup_routines(); - for(auto itr = _data.begin(); itr != _data.end(); ++itr) - { - if(itr->first == _lbl) - { - _data.erase(itr); - break; - } - } -} - -void -roctracer::setup(void* table, bool on_load_trace) -{ - if(!get_use_roctracer()) return; - - auto_lock_t _lk{ type_mutex() }; - 
if(roctracer_is_setup()) return; - roctracer_is_setup() = true; - - ROCPROFSYS_VERBOSE_F(1, "setting up roctracer...\n"); - ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false); - - dynamic_library _amdhip64{ "ROCPROFSYS_ROCTRACER_LIBAMDHIP64", - find_library_path("libamdhip64.so", - { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" }, - { ROCPROFSYS_DEFAULT_ROCM_PATH }) }; - -#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR < 4 - dynamic_library _kfdwrapper{ - "ROCPROFSYS_ROCTRACER_LIBKFDWRAPPER", - find_library_path("libkfdwrapper64.so", { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" }, - { ROCPROFSYS_DEFAULT_ROCM_PATH }, - { "roctracer/lib", "roctracer/lib64", "lib", "lib64" }) - }; -#endif - - ROCPROFSYS_ROCTRACER_CALL(roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr)); - - // Allocating tracing pool - roctracer_properties_t properties{}; - memset(&properties, 0, sizeof(roctracer_properties_t)); - // properties.mode = 0x1000; - properties.buffer_size = 0x100; - properties.buffer_callback_fun = hip_activity_callback; - ROCPROFSYS_ROCTRACER_CALL(roctracer_open_pool(&properties)); - -#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 4 - // HIP 4.5.0 has an invalid warning - redirect _rd{ std::cerr, "roctracer_enable_callback(), get_op_end(), invalid domain " - "ID(4) in: roctracer_enable_callback(hip_api_callback, " - "nullptr)roctracer_enable_activity_expl(), get_op_end(), " - "invalid domain ID(4) in: roctracer_enable_activity()" }; -#endif - - if(get_trace_hip_api()) - { - ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback( - ACTIVITY_DOMAIN_HIP_API, hip_api_callback, nullptr)); - } - - if(get_use_roctx()) - { - ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback( - ACTIVITY_DOMAIN_ROCTX, roctx_api_callback, nullptr)); - } - - if(get_trace_hip_activity()) - { - // Enable HIP activity tracing - ROCPROFSYS_ROCTRACER_CALL( - roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS)); - } - - if(table != nullptr) - { - 
ROCPROFSYS_VERBOSE(1 || on_load_trace, "[OnLoad] setting up HSA...\n"); - - bool trace_hsa_api = get_trace_hsa_api(); - - // Enable HSA API callbacks/activity - if(trace_hsa_api) - { - std::vector hsa_api_vec = - tim::delimit(get_trace_hsa_api_types()); - - // initialize HSA tracing - roctracer_set_properties( - static_cast(ACTIVITY_DOMAIN_HSA_API), (void*) table); - - if(!hsa_api_vec.empty()) - { - for(const auto& itr : hsa_api_vec) - { - uint32_t cid = HSA_API_ID_NUMBER; - const char* api = itr.c_str(); - ROCPROFSYS_ROCTRACER_CALL(roctracer_op_code( - static_cast(ACTIVITY_DOMAIN_HSA_API), api, - &cid, nullptr)); - ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_op_callback( - static_cast(ACTIVITY_DOMAIN_HSA_API), cid, - hsa_api_callback, nullptr)); - - ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-trace(%s)", api); - } - } - else - { - ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-trace()\n"); - ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback( - static_cast(ACTIVITY_DOMAIN_HSA_API), - hsa_api_callback, nullptr)); - } - } - - bool trace_hsa_activity = get_trace_hsa_activity(); - // Enable HSA GPU activity - if(trace_hsa_activity) - { -#if ROCPROFSYS_HIP_VERSION < 50300 - using namespace roctracer; - // initialize HSA tracing - const char* output_prefix = nullptr; - hsa_ops_properties_t ops_properties{ - table, reinterpret_cast(hsa_activity_callback), - nullptr, output_prefix - }; -#elif ROCPROFSYS_HIP_VERSION < 50301 - hsa_ops_properties_t ops_properties; - ops_properties.table = table; - ops_properties.reserved1[0] = reinterpret_cast(&hsa_activity_callback); - ops_properties.reserved1[1] = nullptr; - ops_properties.reserved1[2] = nullptr; -#else - hsa_ops_properties_t ops_properties{ - table, reinterpret_cast(&hsa_activity_callback), nullptr, nullptr - }; -#endif - roctracer_set_properties( - static_cast(ACTIVITY_DOMAIN_HSA_OPS), &ops_properties); - - ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-activity-trace()\n"); - 
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_op_activity( - static_cast(ACTIVITY_DOMAIN_HSA_OPS), HSA_OP_ID_COPY)); - } - } - - // callback for HSA - for(auto& itr : roctracer_setup_routines()) - itr.second(); - - // make sure all async callbacks are allocated - for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i) - hip_exec_activity_callbacks(i); - - ROCPROFSYS_VERBOSE_F(1, "roctracer is setup\n"); -} - -void -roctracer::flush() -{ - auto wait_for_activity_flush_completion = []() { - uint16_t nitr = 0; - while(roctracer_activity_count() > 0 && nitr++ < 10) - std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); - }; - - // a flush may already be happening - wait_for_activity_flush_completion(); - - if(roctracer_activity_count() == 0) - { - ROCPROFSYS_VERBOSE_F(2, "executing roctracer_flush_activity()...\n"); - ROCPROFSYS_ROCTRACER_CALL(roctracer_flush_activity()); - // wait to make sure flush completes - std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); - wait_for_activity_flush_completion(); - } - else - { - ROCPROFSYS_CI_FAIL(true, - "roctracer_activity_count() != 0 (== %li). 
" - "roctracer::shutdown() most likely called during abort", - roctracer_activity_count().load()); - } - - ROCPROFSYS_VERBOSE_F(2, "executing hip_exec_activity_callbacks(0..%zu)\n", - thread_info::get_peak_num_threads()); - // make sure all async operations are executed - for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i) - hip_exec_activity_callbacks(i); - - ROCPROFSYS_VERBOSE_F(2, "roctracer flush completed\n"); -} - -void -roctracer::shutdown() -{ - auto_lock_t _lk{ type_mutex() }; - if(!roctracer_is_setup()) return; - - roctracer_is_setup() = false; - - ROCPROFSYS_VERBOSE_F(1, "shutting down roctracer...\n"); - - // callback for hsa - ROCPROFSYS_VERBOSE_F(2, "executing %zu roctracer_shutdown_routines...\n", - roctracer_shutdown_routines().size()); - for(auto& itr : roctracer_shutdown_routines()) - itr.second(); - -#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 4 - ROCPROFSYS_DEBUG_F("redirecting roctracer warnings\n"); - // HIP 4.5.0 has an invalid warning - redirect _rd{ - std::cerr, "roctracer_disable_callback(), get_op_end(), invalid domain ID(4) " - "in: roctracer_disable_callback()roctracer_disable_activity(), " - "get_op_end(), invalid domain ID(4) in: roctracer_disable_activity()" - }; -#endif - - if(get_trace_hip_api()) - { - ROCPROFSYS_VERBOSE_F( - 2, - "executing roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)...\n"); - ROCPROFSYS_ROCTRACER_CALL( - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)); - } - - if(get_use_roctx()) - { - ROCPROFSYS_VERBOSE_F( - 2, "executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_ROCTX)...\n"); - ROCPROFSYS_ROCTRACER_CALL( - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX)); - } - - if(get_trace_hip_activity()) - { - ROCPROFSYS_VERBOSE_F( - 2, - "executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS)...\n"); - ROCPROFSYS_ROCTRACER_CALL( - roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS)); - } - - if(get_trace_hsa_api()) - 
{ - ROCPROFSYS_VERBOSE_F( - 2, - "executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HSA_API)...\n"); - ROCPROFSYS_ROCTRACER_CALL( - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_API)); - } - - if(get_trace_hsa_api()) - { - ROCPROFSYS_VERBOSE_F( - 2, "executing roctracer_disable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, " - "HSA_OP_ID_COPY)...\n"); - ROCPROFSYS_ROCTRACER_CALL( - roctracer_disable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY)); - } - - ROCPROFSYS_VERBOSE_F(1, "roctracer is shutdown\n"); -} - -scope::transient_destructor -roctracer::protect_flush_activity() -{ - return scope::transient_destructor([]() { --roctracer_activity_count(); }, - []() { ++roctracer_activity_count(); }); -} -} // namespace component -} // namespace rocprofsys - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(roctracer, false, void) -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(roctracer_data, true, double) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.hpp deleted file mode 100644 index 93c9f6055a..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/roctracer.hpp +++ /dev/null @@ -1,117 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#pragma once - -#include "core/common.hpp" -#include "core/components/fwd.hpp" -#include "core/defines.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -ROCPROFSYS_COMPONENT_ALIAS(roctracer_data, - ::tim::component::data_tracker) - -namespace rocprofsys -{ -namespace component -{ -struct roctracer -: base -, private policy::instance_tracker -{ - using value_type = void; - using base_type = base; - using tracker_type = policy::instance_tracker; - - ROCPROFSYS_DEFAULT_OBJECT(roctracer) - - static void preinit(); - static void global_finalize() { shutdown(); } - - static bool is_setup(); - static void setup(void* hsa_api_table, bool on_load_trace = false); - static void flush(); - static void shutdown(); - static void add_setup(const std::string&, std::function&&); - static void add_shutdown(const std::string&, std::function&&); - static void remove_setup(const std::string&); - static void remove_shutdown(const std::string&); - - void start(); - void stop(); - - // this function protects roctracer_flush_activty from being called - // when rocprof-sys exits during a callback - [[nodiscard]] static scope::transient_destructor protect_flush_activity(); -}; - -#if !defined(ROCPROFSYS_USE_ROCTRACER) -inline void -roctracer::setup(void*, bool) -{} - -inline void -roctracer::flush() -{} - -inline void -roctracer::shutdown() -{} - -inline bool -roctracer::is_setup() -{ - return false; -} -#endif -} // namespace component -} // namespace 
rocprofsys - -#if !defined(ROCPROFSYS_USE_ROCTRACER) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer_data, false_type) -#endif - -TIMEMORY_SET_COMPONENT_API(rocprofsys::component::roctracer_data, project::timemory, - category::timing, os::supports_unix) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_timing_category, component::roctracer_data, true_type) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::roctracer_data, true_type) - -#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0 -# if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \ - (defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0) - -# include - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT(roctracer, false, void) -ROCPROFSYS_DECLARE_EXTERN_COMPONENT(roctracer_data, true, double) - -# endif -#endif diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp index bd8505708d..7afc4b3efb 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp @@ -25,12 +25,8 @@ #include "core/debug.hpp" #include "core/dynamic_library.hpp" #include "core/gpu.hpp" -#include "library/components/rocprofiler.hpp" -#include "library/components/roctracer.hpp" -#include "library/rocm/hsa_rsrc_factory.hpp" #include "library/rocm_smi.hpp" -#include "library/rocprofiler.hpp" -#include "library/roctracer.hpp" +#include "library/rocprofiler-sdk.hpp" #include "library/runtime.hpp" #include "library/thread_data.hpp" #include "library/tracing.hpp" @@ -46,208 +42,18 @@ #include #include -#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 -# include +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 +# include #endif -using namespace rocprofsys; - namespace rocprofsys { namespace rocm { -std::mutex rocm_mutex = {}; -bool is_loaded = false; -bool on_load_trace = 
(get_env("ROCP_ONLOAD_TRACE", 0) > 0); +std::vector +rocm_events() +{ + return rocprofiler_sdk::get_rocm_events_info(); +} } // namespace rocm } // namespace rocprofsys - -#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 -std::ostream& -operator<<(std::ostream& _os, const rocprofiler_settings_t& _v) -{ -# define ROCPROF_SETTING_FIELD_STR(NAME) JOIN('=', # NAME, _v.NAME) - - _os << JOIN( - ", ", ROCPROF_SETTING_FIELD_STR(intercept_mode), - ROCPROF_SETTING_FIELD_STR(code_obj_tracking), - ROCPROF_SETTING_FIELD_STR(memcopy_tracking), - ROCPROF_SETTING_FIELD_STR(trace_size), ROCPROF_SETTING_FIELD_STR(trace_local), - ROCPROF_SETTING_FIELD_STR(timeout), ROCPROF_SETTING_FIELD_STR(timestamp_on), - ROCPROF_SETTING_FIELD_STR(hsa_intercepting), - ROCPROF_SETTING_FIELD_STR(k_concurrent), ROCPROF_SETTING_FIELD_STR(opt_mode), - ROCPROF_SETTING_FIELD_STR(obj_dumping)); - return _os; -} -#endif - -// HSA-runtime tool on-load method -extern "C" -{ -#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 - void OnUnloadTool() - { - ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading...\n"); - - rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock }; - if(!_lk.owns_lock()) _lk.lock(); - - if(!rocm::is_loaded) - { - ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, - "rocprofiler is not loaded\n"); - return; - } - rocm::is_loaded = false; - - _lk.unlock(); - - // stop_top_level_timer_if_necessary(); - // Final resources cleanup - rocprofsys::rocprofiler::rocm_cleanup(); - } - - void OnLoadToolProp(rocprofiler_settings_t* settings) - { - using ::rocprofiler::util::HsaRsrcFactory; - - if(!config::get_use_rocprofiler() || config::get_rocm_events().empty()) return; - - ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n"); - - rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock }; - if(!_lk.owns_lock()) _lk.lock(); - - if(rocm::is_loaded) - { - ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, - "rocprofiler is 
already loaded\n"); - return; - } - rocm::is_loaded = true; - - _lk.unlock(); - - // Enable timestamping - settings->timestamp_on = 1; - settings->intercept_mode = 1; - settings->hsa_intercepting = 1; - settings->k_concurrent = 0; - settings->obj_dumping = 0; - // settings->code_obj_tracking = 0; - // settings->memcopy_tracking = 0; - // settings->trace_local = 1; - // settings->opt_mode = 1; - // settings->trace_size = 0; - // settings->timeout = 0; - - ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "rocprofiler settings: %s\n", - JOIN("", *settings).c_str()); - - // Initialize profiling - rocprofsys::rocprofiler::rocm_initialize(); - HsaRsrcFactory::Instance().PrintGpuAgents("ROCm"); - } -#endif - - bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, - const char* const* failed_tool_names) - { - tim::consume_parameters(table, runtime_version, failed_tool_count, - failed_tool_names); - - static bool _once = false; - if(_once) return true; - _once = true; - - ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n"); - ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false); - - if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true)) return true; - if(!tim::settings::enabled()) return true; - - roctracer_is_init() = true; - ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "Loading ROCm tooling...\n"); - - if(!config::settings_are_configured() && get_state() < State::Active) - rocprofsys_init_tooling_hidden(); - - ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); - -#if ROCPROFSYS_HIP_VERSION < 50300 - ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace, - "Computing the roctracer clock skew...\n"); - (void) rocprofsys::get_clock_skew(); -#endif - - if(get_use_process_sampling() && get_use_rocm_smi()) - { - ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace, - "Setting rocm_smi state to active...\n"); - rocm_smi::set_state(State::Active); - } - - comp::roctracer::setup(static_cast(table), rocm::on_load_trace); - -#if 
defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 - bool _force_rocprofiler_init = - tim::get_env("ROCPROFSYS_FORCE_ROCPROFILER_INIT", false, false); -#else - bool _force_rocprofiler_init = false; -#endif - - bool _success = true; - bool _is_empty = - (config::settings_are_configured() && config::get_rocm_events().empty()); - if(_force_rocprofiler_init || (get_use_rocprofiler() && !_is_empty)) - { -#if ROCPROFSYS_HIP_VERSION < 50500 - auto _rocprof = dynamic_library{ - "ROCPROFSYS_ROCPROFILER_LIBRARY", - find_library_path( - "librocprofiler64.so", { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" }, - { ROCPROFSYS_DEFAULT_ROCM_PATH }, - { "lib", "lib64", "rocprofiler/lib", "rocprofiler/lib64" }), - (RTLD_LAZY | RTLD_GLOBAL), false - }; - - ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace, - "Loading rocprofiler library (%s=%s)...\n", - _rocprof.envname.c_str(), _rocprof.filename.c_str()); - _rocprof.open(); - - on_load_t _rocprof_load = nullptr; - _success = _rocprof.invoke("OnLoad", _rocprof_load, table, runtime_version, - failed_tool_count, failed_tool_names); - ROCPROFSYS_CONDITIONAL_PRINT_F(!_success, - "Warning! Invoking rocprofiler's OnLoad " - "failed! ROCPROFSYS_ROCPROFILER_LIBRARY=%s\n", - _rocprof.filename.c_str()); - ROCPROFSYS_CI_THROW(!_success, - "Warning! Invoking rocprofiler's OnLoad " - "failed! ROCPROFSYS_ROCPROFILER_LIBRARY=%s\n", - _rocprof.filename.c_str()); -#endif - } - else - { - using ::rocprofiler::util::HsaRsrcFactory; - - HsaRsrcFactory::Instance().PrintGpuAgents("ROCm"); - } - - gpu::add_hip_device_metadata(); - - ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading... %s\n", - (_success) ? "Done" : "Failed"); - return _success; - } - - // HSA-runtime on-unload method - void OnUnload() - { - ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading...\n"); - rocprofsys_finalize_hidden(); - ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading... 
Done\n"); - } -} diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.hpp index b5b1808f65..131f0d13ea 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.hpp @@ -23,36 +23,48 @@ #pragma once #include "core/defines.hpp" +#include "core/timemory.hpp" -#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 -# include +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 +# include +# include #endif #include #include +#include namespace rocprofsys { namespace rocm { -using lock_t = std::unique_lock; +using hardware_counter_info = ::tim::hardware_counters::info; -extern std::mutex rocm_mutex; -extern bool is_loaded; +std::vector +rocm_events(); + +#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0 +inline std::vector +rocm_events() +{ + return std::vector(); +} +#endif } // namespace rocm } // namespace rocprofsys extern "C" { - struct HsaApiTable; - using on_load_t = bool (*)(HsaApiTable*, uint64_t, uint64_t, const char* const*); + struct rocprofiler_tool_configure_result_t; + struct rocprofiler_client_id_t; - bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, - const char* const* failed_tool_names) ROCPROFSYS_PUBLIC_API; - void OnUnload() ROCPROFSYS_PUBLIC_API; + using rocprofiler_configure_t = + rocprofiler_tool_configure_result_t* (*) (uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* client_id); -#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0 - void OnLoadToolProp(rocprofiler_settings_t* settings) ROCPROFSYS_PUBLIC_API; - void OnUnloadTool() ROCPROFSYS_PUBLIC_API; -#endif + rocprofiler_tool_configure_result_t* rocprofiler_configure( + uint32_t version, const char* runtime_version, uint32_t priority, + rocprofiler_client_id_t* 
client_id) ROCPROFSYS_PUBLIC_API; } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/CMakeLists.txt deleted file mode 100644 index 0136d5d185..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -if(ROCPROFSYS_USE_ROCPROFILER OR ROCPROFSYS_USE_ROCTRACER) - target_sources( - rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/hsa_rsrc_factory.hpp - ${CMAKE_CURRENT_LIST_DIR}/hsa_rsrc_factory.cpp) -endif() diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.cpp deleted file mode 100644 index ecb010cdec..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.cpp +++ /dev/null @@ -1,1027 +0,0 @@ -/****************************************************************************** -Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ - -#include "library/rocm/hsa_rsrc_factory.hpp" -#include "core/debug.hpp" -#include "core/defines.hpp" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace rocprofiler -{ -namespace util -{ -// Demangle C++ symbol name -static const char* -cpp_demangle(const char* symname) -{ - size_t size = 0; - int status; - const char* ret = abi::__cxa_demangle(symname, nullptr, &size, &status); - return (ret != nullptr) ? ret : strdup(symname); -} - -// Callback function to get available in the system agents -hsa_status_t -HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) -{ - hsa_status_t status = HSA_STATUS_ERROR; - HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); - const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); - if(agent_info != nullptr) status = HSA_STATUS_SUCCESS; - return status; -} - -// This function checks to see if the provided -// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, -// the function adds an additional requirement that the pool have the -// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false, -// pools must NOT have this property. -// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is -// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but -// no pool was found meeting the requirements. If an error is encountered, we -// return that error. 
-static hsa_status_t -FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) -{ - hsa_status_t err; - hsa_amd_segment_t segment; - uint32_t flag; - - if(nullptr == data) - { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - CHECK_STATUS("hsa_amd_memory_pool_get_info", err); - if(HSA_AMD_SEGMENT_GLOBAL != segment) - { - return HSA_STATUS_SUCCESS; - } - - err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); - CHECK_STATUS("hsa_amd_memory_pool_get_info", err); - - uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; - - if((karg_st == 0 && kern_arg) || (karg_st != 0 && !kern_arg)) - { - return HSA_STATUS_SUCCESS; - } - - *(reinterpret_cast(data)) = pool; - return HSA_STATUS_INFO_BREAK; -} - -// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that -// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT -// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT -hsa_status_t -FindStandardPool(hsa_amd_memory_pool_t pool, void* data) -{ - return FindGlobalPool(pool, data, false); -} - -// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that -// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS -// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT -hsa_status_t -FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) -{ - return FindGlobalPool(pool, data, true); -} - -// Constructor of the class -HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) -: initialize_hsa_(initialize_hsa) -{ - hsa_status_t status; - - cpu_pool_ = nullptr; - kern_arg_pool_ = nullptr; - - InitHsaApiTable(nullptr); - - // Initialize the Hsa Runtime - if(initialize_hsa_) - { - status = hsa_api_.hsa_init(); - CHECK_STATUS("Error in hsa_init", status); - } - - // Discover the set of Gpu devices available on the 
platform - status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); - CHECK_STATUS("Error Calling hsa_iterate_agents", status); - if(cpu_pool_ == nullptr) - CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); - if(kern_arg_pool_ == nullptr) - CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); - - // Get AqlProfile API table - aqlprofile_api_ = {}; - memset(&aqlprofile_api_, 0, sizeof(aqlprofile_api_)); -#ifdef ROCP_LD_AQLPROFILE - status = LoadAqlProfileLib(&aqlprofile_api_); -#else - status = hsa_api_.hsa_system_get_major_extension_table( - HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, - sizeof(aqlprofile_api_), &aqlprofile_api_); -#endif - CHECK_STATUS("aqlprofile API table load failed", status); - - // Get Loader API table - loader_api_ = {}; - memset(&loader_api_, 0, sizeof(loader_api_)); - status = hsa_api_.hsa_system_get_major_extension_table( - HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); - CHECK_STATUS("loader API table query failed", status); - - // Instantiate HSA timer - timer_ = new HsaTimer(&hsa_api_); - CHECK_STATUS("HSA timer allocation failed", - (timer_ == nullptr) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); - - // Time correlation - const uint32_t corr_iters = 1000; - for(unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1) - { - CorrelateTime((HsaTimer::time_id_t) time_id, corr_iters); - } - - // System timeout - timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) - ? 
timeout_ns_ - : timer_->ns_to_sysclock(timeout_ns_); - - // To dump code objects - to_dump_code_obj_ = getenv("ROCP_DUMP_CODEOBJ"); -} - -// Destructor of the class -HsaRsrcFactory::~HsaRsrcFactory() -{ - delete timer_; - for(const auto* p : cpu_list_) - delete p; - for(const auto* p : gpu_list_) - delete p; - if(initialize_hsa_) - { - hsa_status_t status = hsa_api_.hsa_shut_down(); - try - { - CHECK_STATUS("Error in hsa_shut_down", status); - } catch(std::runtime_error& _e) - { - fflush(stderr); - fprintf(stderr, "%s\n", _e.what()); - fflush(stderr); - abort(); - } - } -} - -void -HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) -{ - std::lock_guard lck(mutex_); - - if(hsa_api_.hsa_init == nullptr) - { - if(table != nullptr) - { - hsa_api_.hsa_init = table->core_->hsa_init_fn; - hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; - hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; - hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; - - hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; - hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; - hsa_api_.hsa_queue_load_read_index_relaxed = - table->core_->hsa_queue_load_read_index_relaxed_fn; - hsa_api_.hsa_queue_load_write_index_relaxed = - table->core_->hsa_queue_load_write_index_relaxed_fn; - hsa_api_.hsa_queue_add_write_index_scacq_screl = - table->core_->hsa_queue_add_write_index_scacq_screl_fn; - - hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; - hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; - hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; - hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; - hsa_api_.hsa_signal_wait_scacquire = - table->core_->hsa_signal_wait_scacquire_fn; - hsa_api_.hsa_signal_store_screlease = - table->core_->hsa_signal_store_screlease_fn; - - hsa_api_.hsa_code_object_reader_create_from_file = - 
table->core_->hsa_code_object_reader_create_from_file_fn; - hsa_api_.hsa_executable_create_alt = - table->core_->hsa_executable_create_alt_fn; - hsa_api_.hsa_executable_load_agent_code_object = - table->core_->hsa_executable_load_agent_code_object_fn; - hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; - hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn; - hsa_api_.hsa_executable_get_symbol = - table->core_->hsa_executable_get_symbol_fn; - hsa_api_.hsa_executable_symbol_get_info = - table->core_->hsa_executable_symbol_get_info_fn; - hsa_api_.hsa_executable_iterate_symbols = - table->core_->hsa_executable_iterate_symbols_fn; - - hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; - hsa_api_.hsa_system_get_major_extension_table = - table->core_->hsa_system_get_major_extension_table_fn; - - hsa_api_.hsa_amd_agent_iterate_memory_pools = - table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; - hsa_api_.hsa_amd_memory_pool_get_info = - table->amd_ext_->hsa_amd_memory_pool_get_info_fn; - hsa_api_.hsa_amd_memory_pool_allocate = - table->amd_ext_->hsa_amd_memory_pool_allocate_fn; - hsa_api_.hsa_amd_agents_allow_access = - table->amd_ext_->hsa_amd_agents_allow_access_fn; - hsa_api_.hsa_amd_memory_async_copy = - table->amd_ext_->hsa_amd_memory_async_copy_fn; - - hsa_api_.hsa_amd_signal_async_handler = - table->amd_ext_->hsa_amd_signal_async_handler_fn; - hsa_api_.hsa_amd_profiling_set_profiler_enabled = - table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; - hsa_api_.hsa_amd_profiling_get_async_copy_time = - table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; - hsa_api_.hsa_amd_profiling_get_dispatch_time = - table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; - } - else - { - hsa_api_.hsa_init = hsa_init; - hsa_api_.hsa_shut_down = hsa_shut_down; - hsa_api_.hsa_agent_get_info = hsa_agent_get_info; - hsa_api_.hsa_iterate_agents = hsa_iterate_agents; - - hsa_api_.hsa_queue_create = 
hsa_queue_create; - hsa_api_.hsa_queue_destroy = hsa_queue_destroy; - hsa_api_.hsa_queue_load_read_index_relaxed = - hsa_queue_load_read_index_relaxed; - hsa_api_.hsa_queue_load_write_index_relaxed = - hsa_queue_load_write_index_relaxed; - hsa_api_.hsa_queue_add_write_index_scacq_screl = - hsa_queue_add_write_index_scacq_screl; - - hsa_api_.hsa_signal_create = hsa_signal_create; - hsa_api_.hsa_signal_destroy = hsa_signal_destroy; - hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; - hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; - hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; - hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; - - hsa_api_.hsa_code_object_reader_create_from_file = - hsa_code_object_reader_create_from_file; - hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; - hsa_api_.hsa_executable_load_agent_code_object = - hsa_executable_load_agent_code_object; - hsa_api_.hsa_executable_freeze = hsa_executable_freeze; - hsa_api_.hsa_executable_destroy = hsa_executable_destroy; - hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; - hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; - hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; - - hsa_api_.hsa_system_get_info = hsa_system_get_info; - hsa_api_.hsa_system_get_major_extension_table = - hsa_system_get_major_extension_table; - - hsa_api_.hsa_amd_agent_iterate_memory_pools = - hsa_amd_agent_iterate_memory_pools; - hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; - hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; - hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; - hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; - - hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; - hsa_api_.hsa_amd_profiling_set_profiler_enabled = - hsa_amd_profiling_set_profiler_enabled; - 
hsa_api_.hsa_amd_profiling_get_async_copy_time = - hsa_amd_profiling_get_async_copy_time; - hsa_api_.hsa_amd_profiling_get_dispatch_time = - hsa_amd_profiling_get_dispatch_time; - } - } -} - -hsa_status_t -HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) -{ - void* handle = dlopen(kAqlProfileLib, RTLD_NOW); - if(handle == nullptr) - { - fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror()); - return HSA_STATUS_ERROR; - } - dlerror(); /* Clear any existing error */ - - api->hsa_ven_amd_aqlprofile_error_string = - (decltype(::hsa_ven_amd_aqlprofile_error_string)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_error_string"); - api->hsa_ven_amd_aqlprofile_validate_event = - (decltype(::hsa_ven_amd_aqlprofile_validate_event)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_validate_event"); - api->hsa_ven_amd_aqlprofile_start = (decltype(::hsa_ven_amd_aqlprofile_start)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_start"); - api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_stop"); -#ifdef AQLPROF_NEW_API - api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_read"); -#endif - api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = - (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); - api->hsa_ven_amd_aqlprofile_get_info = - (decltype(::hsa_ven_amd_aqlprofile_get_info)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_get_info"); - api->hsa_ven_amd_aqlprofile_iterate_data = - (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*) dlsym( - handle, "hsa_ven_amd_aqlprofile_iterate_data"); - - return HSA_STATUS_SUCCESS; -} - -// Add system agent info -const AgentInfo* -HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) -{ - // Determine if device is a Gpu agent - hsa_status_t status; - AgentInfo* agent_info = nullptr; - - hsa_device_type_t type; - status = 
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); - CHECK_STATUS("Error Calling hsa_agent_get_info", status); - - if(type == HSA_DEVICE_TYPE_CPU) - { - agent_info = new AgentInfo{}; - agent_info->dev_id = agent; - agent_info->dev_type = HSA_DEVICE_TYPE_CPU; - agent_info->dev_index = cpu_list_.size(); - - status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, - &agent_info->cpu_pool); - if((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == nullptr)) - cpu_pool_ = &agent_info->cpu_pool; - status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, - &agent_info->kern_arg_pool); - if((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == nullptr)) - kern_arg_pool_ = &agent_info->kern_arg_pool; - agent_info->gpu_pool = {}; - - cpu_list_.push_back(agent_info); - cpu_agents_.push_back(agent); - } - - if(type == HSA_DEVICE_TYPE_GPU) - { - agent_info = new AgentInfo{}; - agent_info->dev_id = agent; - agent_info->dev_type = HSA_DEVICE_TYPE_GPU; - hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); - const int gfxip_label_len = - std::min(strlen(agent_info->name) - 2, sizeof(agent_info->gfxip) - 1); - memcpy(agent_info->gfxip, agent_info->name, gfxip_label_len); - agent_info->gfxip[gfxip_label_len] = '\0'; - hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, - &agent_info->max_wave_size); - hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, - &agent_info->max_queue_size); - hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); - agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; - hsa_api_.hsa_agent_get_info( - agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), - &agent_info->cu_num); - hsa_api_.hsa_agent_get_info( - agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), - &agent_info->waves_per_cu); - hsa_api_.hsa_agent_get_info( - agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), - &agent_info->simds_per_cu); - hsa_api_.hsa_agent_get_info( - agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), - &agent_info->se_num); - hsa_api_.hsa_agent_get_info( - agent, - static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), - &agent_info->shader_arrays_per_se); - - agent_info->cpu_pool = {}; - agent_info->kern_arg_pool = {}; - status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, - &agent_info->gpu_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); - - // GFX8 and GFX9 SGPR/VGPR block sizes - agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 1 : 2; - agent_info->sgpr_block_size = 8; - agent_info->vgpr_block_size = 4; - - // Set GPU index - /*uint32_t driver_node_id; - status = hsa_api_.hsa_agent_get_info( - agent, static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), - &driver_node_id); - CHECK_STATUS("hsa_agent_get_info(gpu hsa_driver_node_id)", status); - agent_info->dev_index = driver_node_id;*/ - // disable this change above (found in the rocprofiler library) - // because it breaks the lookup for rocprofiler_pool_fetch - // lookup in rocprofiler.cpp. 
On my system (one AMD GPU and one NVIDIA GPU), - // it has a value of 1, not 0 and the pool size is 1 - agent_info->dev_index = gpu_list_.size(); - gpu_list_.push_back(agent_info); - gpu_agents_.push_back(agent); - } - - if(agent_info) agent_map_[agent.handle] = agent_info; - - return agent_info; -} - -// Return systen agent info -const AgentInfo* -HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) -{ - const AgentInfo* agent_info = nullptr; - auto it = agent_map_.find(agent.handle); - if(it != agent_map_.end()) - { - agent_info = it->second; - } - return agent_info; -} - -// Get the count of Hsa Gpu Agents available on the platform -// -// @return uint32_t Number of Gpu agents on platform -// -uint32_t -HsaRsrcFactory::GetCountOfGpuAgents() -{ - return uint32_t(gpu_list_.size()); -} - -// Get the count of Hsa Cpu Agents available on the platform -// -// @return uint32_t Number of Cpu agents on platform -// -uint32_t -HsaRsrcFactory::GetCountOfCpuAgents() -{ - return uint32_t(cpu_list_.size()); -} - -// Get the AgentInfo handle of a Gpu device -// -// @param idx Gpu Agent at specified index -// -// @param agent_info Output parameter updated with AgentInfo -// -// @return bool true if successful, false otherwise -// -bool -HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) -{ - // Determine if request is valid - uint32_t size = uint32_t(gpu_list_.size()); - if(idx >= size) - { - return false; - } - - // Copy AgentInfo from specified index - *agent_info = gpu_list_[idx]; - - return true; -} - -// Get the AgentInfo handle of a Cpu device -// -// @param idx Cpu Agent at specified index -// -// @param agent_info Output parameter updated with AgentInfo -// -// @return bool true if successful, false otherwise -// -bool -HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) -{ - // Determine if request is valid - uint32_t size = uint32_t(cpu_list_.size()); - if(idx >= size) - { - return false; - } - - // Copy AgentInfo from 
specified index - *agent_info = cpu_list_[idx]; - return true; -} - -// Create a Queue object and return its handle. The queue object is expected -// to support user requested number of Aql dispatch packets. -// -// @param agent_info Gpu Agent on which to create a queue object -// -// @param num_Pkts Number of packets to be held by queue -// -// @param queue Output parameter updated with handle of queue object -// -// @return bool true if successful, false otherwise -// -bool // NOLINTNEXTLINE(readability-convert-member-functions-to-static) -HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, - hsa_queue_t** queue) -{ - hsa_status_t status; - status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, - nullptr, nullptr, UINT32_MAX, UINT32_MAX, queue); - return (status == HSA_STATUS_SUCCESS); -} - -// Create a Signal object and return its handle. -// @param value Initial value of signal object -// @param signal Output parameter updated with handle of signal object -// @return bool true if successful, false otherwise -bool // NOLINTNEXTLINE(readability-convert-member-functions-to-static) -HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) -{ - hsa_status_t status; - status = hsa_api_.hsa_signal_create(value, 0, nullptr, signal); - return (status == HSA_STATUS_SUCCESS); -} - -// Allocate memory for use by a kernel of specified size in specified -// agent's memory region. -// @param agent_info Agent from whose memory region to allocate -// @param size Size of memory in terms of bytes -// @return uint8_t* Pointer to buffer, null if allocation fails. 
-uint8_t* // NOLINTNEXTLINE(readability-convert-member-functions-to-static) -HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) -{ - hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = nullptr; - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, - reinterpret_cast(&buffer)); - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; - return ptr; -} - -// Allocate memory to pass kernel parameters. -// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo -// parameter. -// @param agent_info Agent from whose memory region to allocate -// @param size Size of memory in terms of bytes -// @return uint8_t* Pointer to buffer, null if allocation fails. -uint8_t* -HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) -{ - hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = nullptr; - if(!cpu_agents_.empty()) - { - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, - reinterpret_cast(&buffer)); - // Both the CPU and GPU can access the kernel arguments - if(status == HSA_STATUS_SUCCESS) - { - hsa_agent_t ag_list[1] = { agent_info->dev_id }; - status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, nullptr, buffer); - } - } - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; - return ptr; -} - -// Allocate system memory accessible by both CPU and GPU -// @param agent_info Agent from whose memory region to allocate -// @param size Size of memory in terms of bytes -// @return uint8_t* Pointer to buffer, null if allocation fails. 
-uint8_t* -HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) -{ - hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = nullptr; - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - if(!cpu_agents_.empty()) - { - status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, - reinterpret_cast(&buffer)); - // Both the CPU and GPU can access the memory - if(status == HSA_STATUS_SUCCESS) - { - hsa_agent_t ag_list[1] = { agent_info->dev_id }; - status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, nullptr, buffer); - } - } - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; - return ptr; -} - -// Allocate memory for command buffer. -// @param agent_info Agent from whose memory region to allocate -// @param size Size of memory in terms of bytes -// @return uint8_t* Pointer to buffer, null if allocation fails. -uint8_t* -HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) -{ - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - uint8_t* ptr = (agent_info->is_apu && CMD_MEMORY_MMAP) - ? 
reinterpret_cast( - mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_SHARED | MAP_ANONYMOUS, 0, 0)) - : AllocateSysMemory(agent_info, size); - return ptr; -} - -// Wait signal -hsa_signal_value_t -HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, - const hsa_signal_value_t& signal_value) const -{ - const hsa_signal_value_t exp_value = signal_value - 1; - hsa_signal_value_t ret_value = signal_value; - while(true) - { - ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, - signal_value, timeout_, - HSA_WAIT_STATE_BLOCKED); - if(ret_value == exp_value) break; - if(ret_value != signal_value) - { - std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" - << signal_value << "), ret_value(" << ret_value << ")" << std::endl - << std::flush; - abort(); - } - } - return ret_value; -} - -// Wait signal with signal value restore -void -HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, - const hsa_signal_value_t& signal_value) const -{ - SignalWait(signal, signal_value); - hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); -} - -// Copy data from GPU to host memory -bool -HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) -{ - hsa_status_t status = HSA_STATUS_ERROR; - if(!cpu_agents_.empty()) - { - hsa_signal_t s = {}; - status = hsa_api_.hsa_signal_create(1, 0, nullptr, &s); - CHECK_STATUS("hsa_signal_create()", status); - status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, - 0, nullptr, s); - CHECK_STATUS("hsa_amd_memory_async_copy()", status); - SignalWait(s, 1); - status = hsa_api_.hsa_signal_destroy(s); - CHECK_STATUS("hsa_signal_destroy()", status); - } - return (status == HSA_STATUS_SUCCESS); -} - -bool -HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, - size_t size) -{ - return Memcpy(agent_info->dev_id, dst, src, size); -} - -// Memory free method -bool -HsaRsrcFactory::FreeMemory(void* 
ptr) -{ - const hsa_status_t status = hsa_memory_free(ptr); - CHECK_STATUS("hsa_memory_free", status); - return (status == HSA_STATUS_SUCCESS); -} - -// Loads an Assembled Brig file and Finalizes it into Device Isa -// @param agent_info Gpu device for which to finalize -// @param brig_path File path of the Assembled Brig file -// @param kernel_name Name of the kernel to finalize -// @param code_desc Handle of finalized Code Descriptor that could -// be used to submit for execution -// @return bool true if successful, false otherwise -bool // NOLINTNEXTLINE(readability-convert-member-functions-to-static) -HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, - const char* kernel_name, hsa_executable_t* executable, - hsa_executable_symbol_t* code_desc) -{ - hsa_status_t status = HSA_STATUS_ERROR; - - // Build the code object filename - std::string filename(brig_path); - std::clog << "Code object filename: " << filename << std::endl; - - // Open the file containing code object - hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); - if(file_handle == -1) - { - std::cerr << "Error: failed to load '" << filename << "'" << std::endl; - assert(false); - return false; - } - - // Create code object reader - hsa_code_object_reader_t code_obj_rdr = { 0 }; - status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); - if(status != HSA_STATUS_SUCCESS) - { - std::cerr << "Failed to create code object reader '" << filename << "'" - << std::endl; - return false; - } - - // Create executable. - status = hsa_api_.hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr, executable); - CHECK_STATUS("Error in creating executable object", status); - - // Load code object. - status = hsa_api_.hsa_executable_load_agent_code_object( - *executable, agent_info->dev_id, code_obj_rdr, nullptr, nullptr); - CHECK_STATUS("Error in loading executable object", status); - - // Freeze executable. 
- status = hsa_api_.hsa_executable_freeze(*executable, ""); - CHECK_STATUS("Error in freezing executable object", status); - - // Get symbol handle. - hsa_executable_symbol_t kernelSymbol; - status = hsa_api_.hsa_executable_get_symbol(*executable, nullptr, kernel_name, - agent_info->dev_id, 0, &kernelSymbol); - CHECK_STATUS("Error in looking up kernel symbol", status); - - close(file_handle); - - // Update output parameter - *code_desc = kernelSymbol; - return true; -} - -// Print the various fields of Hsa Gpu Agents -bool -HsaRsrcFactory::PrintGpuAgents(const std::string&) -{ - std::vector _agents = {}; - for(const auto* itr : gpu_list_) - { - if(itr) _agents.emplace_back(*itr); - } - - ROCPROFSYS_METADATA([_agents](auto& ar) { - namespace cereal = ::tim::cereal; - - ar.setNextName("rocm_agents"); - ar.startNode(); - ar.makeArray(); - for(auto itr : _agents) - { - ar.startNode(); - ar(cereal::make_nvp("name", std::string{ itr.name }), - cereal::make_nvp("is_apu", itr.is_apu), - cereal::make_nvp("hsa_profile", itr.profile), - cereal::make_nvp("max_wave_size", itr.max_wave_size), - cereal::make_nvp("max_queue_size", itr.max_queue_size), - cereal::make_nvp("cu_number", itr.cu_num), - cereal::make_nvp("waves_per_cu", itr.waves_per_cu), - cereal::make_nvp("simds_per_cu", itr.simds_per_cu), - cereal::make_nvp("se_num", itr.se_num), - cereal::make_nvp("shader_arrays_per_se", itr.shader_arrays_per_se)); - ar.finishNode(); - } - ar.finishNode(); - }); - - return true; -} - -void* -HsaRsrcFactory::GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx) -{ - const uint32_t slot_size_b = CMD_SLOT_SIZE_B; - const uint32_t slot_idx = (uint32_t)(idx % queue->size); - void* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + - (slot_idx * slot_size_b)); - return queue_slot; -} - -void* -HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) -{ - const uint64_t read_idx = hsa_api_.hsa_queue_load_read_index_relaxed(queue); - return GetSlotPointer(queue, read_idx); -} - 
-uint64_t -HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) -{ - const uint32_t slot_size_b = CMD_SLOT_SIZE_B; - - // adevance command queue - const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1); - while((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) - { - sched_yield(); - } - - const uint32_t slot_idx = (uint32_t)(write_idx % queue->size); - uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + - (slot_idx * slot_size_b)); - const uint32_t* slot_data = reinterpret_cast(packet); - - // Copy buffered commands into the queue slot. - // Overwrite the AQL invalid header (first dword) last. - // This prevents the slot from being read until it's fully written. - memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t)); - std::atomic* header_atomic_ptr = - reinterpret_cast*>(&queue_slot[0]); - header_atomic_ptr->store(slot_data[0], std::memory_order_release); - - // ringdoor bell - hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); - - return write_idx; -} - -uint64_t -HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) -{ - const uint32_t slot_size_b = CMD_SLOT_SIZE_B; - if((size_bytes & (slot_size_b - 1)) != 0) - { - fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); - abort(); - } - - const char* begin = reinterpret_cast(packet); - const char* end = begin + size_bytes; - uint64_t write_idx = 0; - for(const char* ptr = begin; ptr < end; ptr += slot_size_b) - { - write_idx = Submit(queue, ptr); - } - - return write_idx; -} - -const char* -HsaRsrcFactory::GetKernelNameRef(uint64_t addr) -{ - std::lock_guard lck(mutex_); - const auto it = symbols_map_->find(addr); - if(it == symbols_map_->end()) - { - fprintf(stderr, - "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", - addr); - abort(); - } - return it->second; -} - -void 
-HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) -{ - std::lock_guard lck(mutex_); - executable_tracking_on_ = true; - table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; - table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor; -} - -hsa_status_t -HsaRsrcFactory::executable_symbols_cb(hsa_executable_t /*exec*/, - hsa_executable_symbol_t symbol, void* data) -{ - hsa_symbol_kind_t value = (hsa_symbol_kind_t) 0; - hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); - CHECK_STATUS("Error in getting symbol info", status); - if(value == HSA_SYMBOL_KIND_KERNEL) - { - uint64_t addr = 0; - uint32_t len = 0; - status = hsa_api_.hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); - CHECK_STATUS("Error in getting kernel object", status); - status = hsa_api_.hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); - CHECK_STATUS("Error in getting name len", status); - char* symname = new char[len + 1]; - status = hsa_api_.hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symname); - CHECK_STATUS("Error in getting kernel name", status); - symname[len] = 0; - if(data == nullptr) - { - const char* name = cpp_demangle(symname); - auto ret = symbols_map_->insert({ addr, name }); - if(ret.second == false) - { - delete[] ret.first->second; - ret.first->second = name; - } - } - else - { - symbols_map_->erase(addr); - } - delete[] symname; - } - return HSA_STATUS_SUCCESS; -} - -hsa_status_t -HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, - const char* options) -{ - std::lock_guard lck(mutex_); - if(symbols_map_ == nullptr) symbols_map_ = new symbols_map_t; - hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols( - executable, executable_symbols_cb, nullptr); - CHECK_STATUS("Error in iterating executable symbols", status); - return 
hsa_api_.hsa_executable_freeze(executable, options); -} - -hsa_status_t -HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) -{ - std::lock_guard lck(mutex_); - if(symbols_map_ != nullptr) - { - hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols( - executable, executable_symbols_cb, (void*) 1); - CHECK_STATUS("Error in iterating executable symbols", status); - } - return hsa_api_.hsa_executable_destroy(executable); -} - -std::atomic HsaRsrcFactory::instance_{}; -HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; -HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; -hsa_pfn_t HsaRsrcFactory::hsa_api_{}; -bool HsaRsrcFactory::executable_tracking_on_ = false; -HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = nullptr; -void* HsaRsrcFactory::to_dump_code_obj_ = nullptr; - -} // namespace util -} // namespace rocprofiler diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.hpp deleted file mode 100644 index 9e255ce98a..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm/hsa_rsrc_factory.hpp +++ /dev/null @@ -1,582 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#pragma once - -#include "core/exception.hpp" - -#define AMD_INTERNAL_BUILD 1 - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define HSA_ARGUMENT_ALIGN_BYTES 16 -#define HSA_QUEUE_ALIGN_BYTES 64 -#define HSA_PACKET_ALIGN_BYTES 64 -#define HSA_MESSAGE_LENGTH 4096 - -#define CHECK_STATUS(msg, status) \ - do \ - { \ - if((status) != HSA_STATUS_SUCCESS) \ - { \ - const char* emsg = 0; \ - hsa_status_string(status, &emsg); \ - char _buffer[HSA_MESSAGE_LENGTH]; \ - snprintf(_buffer, HSA_MESSAGE_LENGTH - 1, "%s: %s", msg, \ - emsg ? emsg : ""); \ - throw ::rocprofsys::exception(_buffer); \ - } \ - } while(0) - -#define CHECK_ITER_STATUS(msg, status) \ - do \ - { \ - if((status) != HSA_STATUS_INFO_BREAK) \ - { \ - const char* emsg = 0; \ - hsa_status_string(status, &emsg); \ - char _buffer[HSA_MESSAGE_LENGTH]; \ - snprintf(_buffer, HSA_MESSAGE_LENGTH - 1, "%s: %s", msg, \ - emsg ? 
emsg : ""); \ - throw ::rocprofsys::exception(_buffer); \ - } \ - } while(0) - -namespace rocprofiler -{ -namespace util -{ -static const size_t MEM_PAGE_BYTES = 0x1000; -static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; -typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; - -struct hsa_pfn_t -{ - decltype(::hsa_init)* hsa_init; - decltype(::hsa_shut_down)* hsa_shut_down; - decltype(::hsa_agent_get_info)* hsa_agent_get_info; - decltype(::hsa_iterate_agents)* hsa_iterate_agents; - - decltype(::hsa_queue_create)* hsa_queue_create; - decltype(::hsa_queue_destroy)* hsa_queue_destroy; - decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; - decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; - decltype( - ::hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl; - - decltype(::hsa_signal_create)* hsa_signal_create; - decltype(::hsa_signal_destroy)* hsa_signal_destroy; - decltype(::hsa_signal_load_relaxed)* hsa_signal_load_relaxed; - decltype(::hsa_signal_store_relaxed)* hsa_signal_store_relaxed; - decltype(::hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; - decltype(::hsa_signal_store_screlease)* hsa_signal_store_screlease; - - decltype(::hsa_code_object_reader_create_from_file)* - hsa_code_object_reader_create_from_file; - decltype(::hsa_executable_create_alt)* hsa_executable_create_alt; - decltype( - ::hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; - decltype(::hsa_executable_freeze)* hsa_executable_freeze; - decltype(::hsa_executable_destroy)* hsa_executable_destroy; - decltype(::hsa_executable_get_symbol)* hsa_executable_get_symbol; - decltype(::hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; - decltype(::hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; - - decltype(::hsa_system_get_info)* hsa_system_get_info; - decltype( - ::hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; 
- - decltype(::hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; - decltype(::hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; - decltype(::hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate; - decltype(::hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; - decltype(::hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; - - decltype(::hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; - decltype( - ::hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled; - decltype( - ::hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; - decltype(::hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time; -}; - -// Encapsulates information about a Hsa Agent such as its -// handle, name, max queue size, max wavefront size, etc. -struct AgentInfo -{ - // Handle of Agent - hsa_agent_t dev_id; - - // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 - uint32_t dev_type; - - // APU flag - bool is_apu; - - // Agent system index - uint32_t dev_index; - - // GFXIP name - char gfxip[64]; - - // Name of Agent whose length is less than 64 - char name[64]; - - // Max size of Wavefront size - uint32_t max_wave_size; - - // Max size of Queue buffer - uint32_t max_queue_size; - - // Hsail profile supported by agent - hsa_profile_t profile; - - // CPU/GPU/kern-arg memory pools - hsa_amd_memory_pool_t cpu_pool; - hsa_amd_memory_pool_t gpu_pool; - hsa_amd_memory_pool_t kern_arg_pool; - - // The number of compute unit available in the agent. - uint32_t cu_num; - - // Maximum number of waves possible in a Compute Unit. 
- uint32_t waves_per_cu; - - // Number of SIMD's per compute unit CU - uint32_t simds_per_cu; - - // Number of Shader Engines (SE) in Gpu - uint32_t se_num; - - // Number of Shader Arrays Per Shader Engines in Gpu - uint32_t shader_arrays_per_se; - - // SGPR/VGPR/LDS block sizes - uint32_t sgpr_block_dflt; - uint32_t sgpr_block_size; - uint32_t vgpr_block_size; - static const uint32_t lds_block_size = 128 * 4; -}; - -// HSA timer class -// Provides current HSA timestampa and system-clock/ns conversion API -class HsaTimer -{ -public: - typedef uint64_t timestamp_t; - static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; - typedef long double freq_t; - - enum time_id_t - { - TIME_ID_CLOCK_REALTIME = 0, - TIME_ID_CLOCK_REALTIME_COARSE = 1, - TIME_ID_CLOCK_MONOTONIC = 2, - TIME_ID_CLOCK_MONOTONIC_COARSE = 3, - TIME_ID_CLOCK_MONOTONIC_RAW = 4, - TIME_ID_NUMBER - }; - - HsaTimer(const hsa_pfn_t* hsa_api) - : hsa_api_(hsa_api) - { - timestamp_t sysclock_hz = 0; - hsa_status_t status = hsa_api_->hsa_system_get_info( - HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); - CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); - sysclock_factor_ = (freq_t) 1000000000 / (freq_t) sysclock_hz; - } - - // Methods for system-clock/ns conversion - timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const - { - return timestamp_t((freq_t) sysclock * sysclock_factor_); - } - timestamp_t ns_to_sysclock(const timestamp_t& time) const - { - return timestamp_t((freq_t) time / sysclock_factor_); - } - - // Method for timespec/ns conversion - static timestamp_t timespec_to_ns(const timespec& time) - { - return ((timestamp_t) time.tv_sec * 1000000000) + time.tv_nsec; - } - - // Return timestamp in 'ns' - timestamp_t timestamp_ns() const - { - timestamp_t sysclock; - hsa_status_t status = - hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); - CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); - return 
sysclock_to_ns(sysclock); - } - - // Return time in 'ns' - timestamp_t clocktime_ns(clockid_t clock_id) const - { - timespec time; - clock_gettime(clock_id, &time); - return timespec_to_ns(time); - } - - // Return pair of correlated values of profiling timestamp and time with - // correlation error for a given time ID and number of iterations - void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v, - timestamp_t* time_v, timestamp_t* error_v) - { - clockid_t clock_id = 0; - switch(time_id) - { - case TIME_ID_CLOCK_REALTIME: clock_id = CLOCK_REALTIME; break; - case TIME_ID_CLOCK_REALTIME_COARSE: clock_id = CLOCK_REALTIME_COARSE; break; - case TIME_ID_CLOCK_MONOTONIC: clock_id = CLOCK_MONOTONIC; break; - case TIME_ID_CLOCK_MONOTONIC_COARSE: clock_id = CLOCK_MONOTONIC_COARSE; break; - case TIME_ID_CLOCK_MONOTONIC_RAW: clock_id = CLOCK_MONOTONIC_RAW; break; - default: CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); - } - - std::vector ts_vec(iters); - std::vector tm_vec(iters); - const uint32_t steps = iters - 1; - - for(uint32_t i = 0; i < iters; ++i) - { - hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]); - clock_gettime(clock_id, &tm_vec[i]); - } - - const timestamp_t ts_base = sysclock_to_ns(ts_vec.front()); - const timestamp_t tm_base = timespec_to_ns(tm_vec.front()); - const timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps); - - timestamp_t ts_accum = 0; - timestamp_t tm_accum = 0; - for(uint32_t i = 0; i < iters; ++i) - { - ts_accum += (ts_vec[i] - ts_base); - tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base); - } - - *timestamp_v = (ts_accum / iters) + ts_base + error; - *time_v = (tm_accum / iters) + tm_base; - *error_v = error; - } - -private: - // Timestamp frequency factor - freq_t sysclock_factor_; - // HSA API table - const hsa_pfn_t* const hsa_api_; -}; - -class HsaRsrcFactory -{ -public: - static const size_t CMD_SLOT_SIZE_B = 0x40; - typedef std::recursive_mutex 
mutex_t; - typedef HsaTimer::timestamp_t timestamp_t; - - static HsaRsrcFactory* Create(bool initialize_hsa = true) - { - std::lock_guard lck(mutex_); - HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); - if(obj == nullptr) - { - obj = new HsaRsrcFactory(initialize_hsa); - instance_.store(obj, std::memory_order_release); - } - return obj; - } - - static HsaRsrcFactory& Instance() - { - HsaRsrcFactory* obj = instance_.load(std::memory_order_acquire); - if(obj == nullptr) obj = Create(false); - hsa_status_t status = (obj != nullptr) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; - CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); - return *obj; - } - - static void Destroy() - { - std::lock_guard lck(mutex_); - if(instance_) delete instance_.load(); - instance_ = nullptr; - } - - // Return system agent info - const AgentInfo* GetAgentInfo(const hsa_agent_t agent); - - // Get the count of Hsa Gpu Agents available on the platform - // @return uint32_t Number of Gpu agents on platform - uint32_t GetCountOfGpuAgents(); - - // Get the count of Hsa Cpu Agents available on the platform - // @return uint32_t Number of Cpu agents on platform - uint32_t GetCountOfCpuAgents(); - - // Get the AgentInfo handle of a Gpu device - // @param idx Gpu Agent at specified index - // @param agent_info Output parameter updated with AgentInfo - // @return bool true if successful, false otherwise - bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); - - // Get the AgentInfo handle of a Cpu device - // @param idx Cpu Agent at specified index - // @param agent_info Output parameter updated with AgentInfo - // @return bool true if successful, false otherwise - bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); - - // Create a Queue object and return its handle. The queue object is expected - // to support user requested number of Aql dispatch packets. 
- // @param agent_info Gpu Agent on which to create a queue object - // @param num_Pkts Number of packets to be held by queue - // @param queue Output parameter updated with handle of queue object - // @return bool true if successful, false otherwise - bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); - - // Create a Signal object and return its handle. - // @param value Initial value of signal object - // @param signal Output parameter updated with handle of signal object - // @return bool true if successful, false otherwise - bool CreateSignal(uint32_t value, hsa_signal_t* signal); - - // Allocate local GPU memory - // @param agent_info Agent from whose memory region to allocate - // @param size Size of memory in terms of bytes - // @return uint8_t* Pointer to buffer, null if allocation fails. - uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); - - // Allocate memory tp pass kernel parameters - // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo - // parameter. - // @param agent_info Agent from whose memory region to allocate - // @param size Size of memory in terms of bytes - // @return uint8_t* Pointer to buffer, null if allocation fails. - uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); - - // Allocate system memory accessible from both CPU and GPU - // Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored. - // @param agent_info Agent from whose memory region to allocate - // @param size Size of memory in terms of bytes - // @return uint8_t* Pointer to buffer, null if allocation fails. - uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); - - // Allocate memory for command buffer. - // @param agent_info Agent from whose memory region to allocate - // @param size Size of memory in terms of bytes - // @return uint8_t* Pointer to buffer, null if allocation fails. 
- uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); - - // Wait signal - hsa_signal_value_t SignalWait(const hsa_signal_t& signal, - const hsa_signal_value_t& signal_value) const; - - // Wait signal with signal value restore - void SignalWaitRestore(const hsa_signal_t& signal, - const hsa_signal_value_t& signal_value) const; - - // Copy data from GPU to host memory - bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); - bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); - - // Memory free method - static bool FreeMemory(void* ptr); - - // Loads an Assembled Brig file and Finalizes it into Device Isa - // @param agent_info Gpu device for which to finalize - // @param brig_path File path of the Assembled Brig file - // @param kernel_name Name of the kernel to finalize - // @param code_desc Handle of finalized Code Descriptor that could - // be used to submit for execution - // @return true if successful, false otherwise - bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, - const char* kernel_name, hsa_executable_t* hsa_exec, - hsa_executable_symbol_t* code_desc); - - // Print the various fields of Hsa Gpu Agents - bool PrintGpuAgents(const std::string& header); - - // Utils for submitting AQL packet to a given queue - static void* GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx); - static void* GetReadPointer(hsa_queue_t* queue); - static uint64_t Submit(hsa_queue_t* queue, const void* packet); - static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); - - // Enable executables loading tracking - static bool IsExecutableTracking() { return executable_tracking_on_; } - static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelNameRef(uint64_t addr); - - // Initialize HSA API table - void static InitHsaApiTable(HsaApiTable* table); - static const hsa_pfn_t* HsaApi() { return &hsa_api_; } - - // Return 
AqlProfile API table - typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; - const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } - - // Return Loader API table - const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } - - // Methods for system-clock/ns conversion and timestamp in 'ns' - timestamp_t SysclockToNs(const timestamp_t& sysclock) const - { - return timer_->sysclock_to_ns(sysclock); - } - timestamp_t NsToSysclock(const timestamp_t& time) const - { - return timer_->ns_to_sysclock(time); - } - timestamp_t TimestampNs() const { return timer_->timestamp_ns(); } - - timestamp_t GetSysTimeout() const { return timeout_; } - static timestamp_t GetTimeoutNs() { return timeout_ns_; } - static void SetTimeoutNs(const timestamp_t& time) - { - std::lock_guard lck(mutex_); - timeout_ns_ = time; - if(instance_ != nullptr) - Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); - } - - void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) - { - timestamp_t timestamp_v = 0; - timestamp_t time_v = 0; - timestamp_t error_v = 0; - timer_->correlated_pair_ns(time_id, iters, ×tamp_v, &time_v, &error_v); - time_shift_[time_id] = time_v - timestamp_v; - time_error_[time_id] = error_v; - } - - hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value) - { - if(time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; - *time_value = time_stamp + time_shift_[time_id]; - return HSA_STATUS_SUCCESS; - } - - hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err) - { - *err = time_error_[time_id]; - return HSA_STATUS_SUCCESS; - } - -private: - // System agents iterating callback - static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); - - // Callback function to find and bind kernarg region of an agent - static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data); - - // Load AQL profile HSA extension library directly - static hsa_status_t 
LoadAqlProfileLib(aqlprofile_pfn_t* api); - - // Constructor of the class. Will initialize the Hsa Runtime and - // query the system topology to get the list of Cpu and Gpu devices - explicit HsaRsrcFactory(bool initialize_hsa); - - // Destructor of the class - ~HsaRsrcFactory(); - - // Add an instance of AgentInfo representing a Hsa Gpu agent - const AgentInfo* AddAgentInfo(const hsa_agent_t agent); - - // To mmap command buffer memory - static const bool CMD_MEMORY_MMAP = false; - - // HSA was initialized - const bool initialize_hsa_; - - static std::atomic instance_; - static mutex_t mutex_; - - // Used to maintain a list of Hsa Gpu Agent Info - std::vector gpu_list_; - std::vector gpu_agents_; - - // Used to maintain a list of Hsa Cpu Agent Info - std::vector cpu_list_; - std::vector cpu_agents_; - - // System agents map - std::map agent_map_; - - // Executables loading tracking - typedef std::map symbols_map_t; - static symbols_map_t* symbols_map_; - static bool executable_tracking_on_; - static void* to_dump_code_obj_; - static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, - const char* options); - static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable); - static hsa_status_t executable_symbols_cb(hsa_executable_t exec, - hsa_executable_symbol_t symbol, void* data); - - // HSA runtime API table - static hsa_pfn_t hsa_api_; - - // AqlProfile API table - aqlprofile_pfn_t aqlprofile_api_; - - // Loader API table - hsa_ven_amd_loader_1_00_pfn_t loader_api_; - - // System timeout, ns - static timestamp_t timeout_ns_; - // System timeout, sysclock - timestamp_t timeout_; - - // HSA timer - HsaTimer* timer_; - - // Time shift array to support time conversion - timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; - timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; - - // CPU/kern-arg memory pools - hsa_amd_memory_pool_t* cpu_pool_; - hsa_amd_memory_pool_t* kern_arg_pool_; -}; - -} // namespace util -} // 
namespace rocprofiler diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm_smi.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm_smi.hpp index 09d67e7517..ef1b3d4302 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm_smi.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm_smi.hpp @@ -128,7 +128,8 @@ private: static bool shutdown(); }; -#if !defined(ROCPROFSYS_USE_ROCM_SMI) +#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0 + inline void setup() {} @@ -154,7 +155,7 @@ inline void set_state(State) {} } // namespace rocm_smi } // namespace rocprofsys -#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0 +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 # if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \ (defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp new file mode 100644 index 0000000000..cb1d5627a4 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -0,0 +1,1308 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/rocprofiler-sdk.hpp" +#include "api.hpp" +#include "common/synchronized.hpp" +#include "core/config.hpp" +#include "core/containers/stable_vector.hpp" +#include "core/debug.hpp" +#include "core/gpu.hpp" +#include "core/perfetto.hpp" +#include "core/rocprofiler-sdk.hpp" +#include "core/state.hpp" +#include "library/components/category_region.hpp" +#include "library/rocm_smi.hpp" +#include "library/rocprofiler-sdk/counters.hpp" +#include "library/rocprofiler-sdk/fwd.hpp" +#include "library/thread_info.hpp" +#include "library/tracing.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +namespace +{ +using tool_agent_vec_t = std::vector; +client_data* tool_data = new client_data{}; + +void +thread_precreate(rocprofiler_runtime_library_t /*lib*/, void* /*tool_data*/) +{ + push_thread_state(ThreadState::Internal); +} + +void +thread_postcreate(rocprofiler_runtime_library_t /*lib*/, void* /*tool_data*/) +{ + pop_thread_state(); +} + +// this function creates a rocprofiler profile config on the first entry +std::vector +create_agent_profile(rocprofiler_agent_id_t agent_id, + const std::vector& counters, + // const tool_agent_vec_t& gpu_agents, + // const 
agent_counter_info_map_t& counters_info, + // agent_counter_profile_map_t& data) + client_data* data = tool_data) +{ + using counter_vec_t = std::vector; + + // check if already created + if(data->agent_counter_profiles.find(agent_id) != data->agent_counter_profiles.end()) + return counter_vec_t{}; + + auto profile = std::optional{}; + auto expected_v = counters.size(); + auto found_v = std::vector{}; + auto counters_v = counter_vec_t{}; + const auto* tool_agent_v = data->get_gpu_tool_agent(agent_id); + + constexpr auto device_qualifier = std::string_view{ ":device=" }; + for(const auto& itr : counters) + { + auto name_v = itr; + if(auto pos = std::string::npos; + (pos = itr.find(device_qualifier)) != std::string::npos) + { + name_v = itr.substr(0, pos); + auto dev_id_s = itr.substr(pos + device_qualifier.length()); + + ROCPROFSYS_CONDITIONAL_ABORT(dev_id_s.empty() || + dev_id_s.find_first_not_of("0123456789") != + std::string::npos, + "invalid device qualifier format (':device=N) " + "where N is the GPU id: %s\n", + itr.c_str()); + + auto dev_id_v = std::stoul(dev_id_s); + + ROCPROFSYS_PRINT_F("tool agent device id=%lu, name=%s, device_id=%lu\n", + tool_agent_v->device_id, name_v.c_str(), dev_id_v); + // skip this counter if the counter is for a specific device id (which + // doesn't this agent's device id) + if(dev_id_v != tool_agent_v->device_id) + { + --expected_v; // is not expected + continue; + } + } + + auto _old_name_v = name_v; + name_v = + std::regex_replace(name_v, std::regex{ "^(.*)(\\[)([0-9]+)(\\])$" }, "$1"); + if(name_v != _old_name_v) + ROCPROFSYS_PRINT_F("tool agent device id=%lu, old_name=%s, name=%s\n", + tool_agent_v->device_id, _old_name_v.c_str(), + name_v.c_str()); + + // search the gpu agent counter info for a counter with a matching name + for(const auto& citr : data->agent_counter_info.at(agent_id)) + { + if(name_v == std::string_view{ citr.name }) + { + counters_v.emplace_back(citr.id); + found_v.emplace_back(itr); + } + } + } + + 
if(counters_v.size() != expected_v) + { + auto requested_counters = + timemory::join::join(timemory::join::array_config{ ", ", "", "" }, counters); + auto found_counters = + timemory::join::join(timemory::join::array_config{ ", ", "", "" }, found_v); + + ROCPROFSYS_ABORT_F( + "Unable to find all counters for agent %i (gpu-%li, %s) in %s. Found: %s\n", + tool_agent_v->agent->node_id, tool_agent_v->device_id, + tool_agent_v->agent->name, requested_counters.c_str(), + found_counters.c_str()); + } + + if(!counters_v.empty()) + { + auto profile_v = rocprofiler_profile_config_id_t{}; + ROCPROFILER_CALL(rocprofiler_create_profile_config( + agent_id, counters_v.data(), counters_v.size(), &profile_v)); + profile = profile_v; + } + + data->agent_counter_profiles.emplace(agent_id, profile); + + return counters_v; +} + +const kernel_symbol_data_t* +get_kernel_symbol_info(uint64_t _kernel_id) +{ + return tool_data->get_kernel_symbol_info(_kernel_id); +} + +// Implementation of rocprofiler_callback_tracing_operation_args_cb_t +int +save_args(rocprofiler_callback_tracing_kind_t /*kind*/, int32_t /*operation*/, + uint32_t /*arg_number*/, const void* const /*arg_value_addr*/, + int32_t /*arg_indirection_count*/, const char* /*arg_type*/, + const char* arg_name, const char* arg_value_str, + int32_t /*arg_dereference_count*/, void* data) +{ + auto* argvec = static_cast(data); + argvec->emplace_back(arg_name, arg_value_str); + return 0; +} + +auto& +get_marker_pushed_ranges() +{ + static thread_local auto _v = std::vector{}; + return _v; +} + +auto& +get_marker_started_ranges() +{ + static thread_local auto _v = std::vector{}; + return _v; +} + +template +void +tool_tracing_callback_start(CategoryT, rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /*user_data*/, + rocprofiler_timestamp_t /*ts*/) +{ + auto _name = tool_data->callback_tracing_info.at(record.kind, record.operation); + + if constexpr(std::is_same::value) + { + if(record.kind == 
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API) + { + auto* _data = static_cast( + record.payload); + + switch(record.operation) + { + case ROCPROFILER_MARKER_CORE_API_ID_roctxRangePushA: + { + _name = _data->args.roctxRangePushA.message; + auto _hash = tim::add_hash_id(_name); + get_marker_pushed_ranges().emplace_back(_hash); + break; + } + case ROCPROFILER_MARKER_CORE_API_ID_roctxRangeStartA: + { + _name = _data->args.roctxRangeStartA.message; + auto _hash = tim::add_hash_id(_name); + get_marker_started_ranges().emplace_back(_hash); + break; + } + case ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA: + { + _name = _data->args.roctxMarkA.message; + tim::add_hash_id(_name); + break; + } + default: + { + break; + } + } + } + } + + if(get_use_timemory()) + { + component::category_region::start( + _name); + } +} + +template +void +tool_tracing_callback_stop( + CategoryT, rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, rocprofiler_timestamp_t ts, + std::optional>& _bt_data) +{ + auto _name = tool_data->callback_tracing_info.at(record.kind, record.operation); + + if constexpr(std::is_same::value) + { + if(record.kind == ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API) + { + auto* _data = static_cast( + record.payload); + + switch(record.operation) + { + case ROCPROFILER_MARKER_CORE_API_ID_roctxRangePop: + { + ROCPROFSYS_CONDITIONAL_ABORT_F( + get_marker_pushed_ranges().empty(), + "roctxRangePop does not have corresponding roctxRangePush on " + "this thread"); + + auto _hash = get_marker_pushed_ranges().back(); + _name = tim::get_hash_identifier_fast(_hash); + get_marker_pushed_ranges().pop_back(); + break; + } + case ROCPROFILER_MARKER_CORE_API_ID_roctxRangeStop: + { + ROCPROFSYS_CONDITIONAL_ABORT_F( + get_marker_started_ranges().empty(), + "roctxRangeStop does not have corresponding roctxRangeStart on " + "this thread"); + + auto _hash = get_marker_started_ranges().back(); + _name = tim::get_hash_identifier_fast(_hash); + 
get_marker_started_ranges().pop_back(); + break; + } + case ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA: + { + _name = _data->args.roctxMarkA.message; + break; + } + default: + { + break; + } + } + } + } + + if(get_use_timemory()) + { + component::category_region::stop( + _name); + } + + if(get_use_perfetto()) + { + auto args = callback_arg_array_t{}; + if(config::get_perfetto_annotations()) + { + rocprofiler_iterate_callback_tracing_kind_operation_args(record, save_args, 2, + &args); + } + + uint64_t _beg_ts = user_data->value; + uint64_t _end_ts = ts; + + tracing::push_perfetto_ts( + CategoryT{}, _name.data(), _beg_ts, + ::perfetto::Flow::ProcessScoped(record.correlation_id.internal), + [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ts); + + for(const auto& [key, val] : args) + tracing::add_perfetto_annotation(ctx, key, val); + + if(_bt_data && !_bt_data->empty()) + { + const std::string _unk = "??"; + size_t _bt_cnt = 0; + for(const auto& itr : *_bt_data) + { + auto _linfo = itr.lineinfo.get(); + const auto* _func = (itr.name.empty()) ? &_unk : &itr.name; + const auto* _loc = + (_linfo && !_linfo.location.empty()) + ? &_linfo.location + : ((itr.location.empty()) ? &_unk : &itr.location); + auto _line = (_linfo && _linfo.line > 0) + ? join("", _linfo.line) + : ((itr.lineno == 0) ? std::string{ "?" } + : join("", itr.lineno)); + auto _entry = + join("", demangle(*_func), " @ ", + join(':', ::basename(_loc->c_str()), _line)); + if(_bt_cnt < 10) + { + // Prepend zero for better ordering in UI. Only one zero + // is ever necessary since stack depth is limited to 16. 
+ tracing::add_perfetto_annotation( + ctx, join("", "frame#0", _bt_cnt++), _entry); + } + else + { + tracing::add_perfetto_annotation( + ctx, join("", "frame#", _bt_cnt++), _entry); + } + } + } + } + }); + tracing::pop_perfetto_ts( + CategoryT{}, _name.data(), _end_ts, [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + tracing::add_perfetto_annotation(ctx, "end_ns", _end_ts); + }); + } +} + +void +tool_control_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /*user_data*/, void* /*callback_data*/) +{ + if(record.kind == ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API) + { + if(record.operation == ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerPause && + record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + stop(); + } + else if(record.operation == + ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerResume && + record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + { + start(); + } + } +} + +void +tool_code_object_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /*user_data*/, void* /*callback_data*/) +{ + auto ts = rocprofiler_timestamp_t{}; + ROCPROFILER_CALL(rocprofiler_get_timestamp(&ts)); + + if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) + { + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + if(record.operation == ROCPROFILER_CODE_OBJECT_LOAD) + { + auto data_v = + *static_cast( + record.payload); + tool_data->code_object_records.wlock([ts, &record, &data_v](auto& _data) { + _data.emplace_back( + code_object_callback_record_t{ ts, record, data_v }); + }); + } + else if(record.operation == + ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + auto data_v = *static_cast(record.payload); + tool_data->kernel_symbol_records.wlock( + [ts, &record, &data_v](auto& _data) { + _data.emplace_back( + new kernel_symbol_callback_record_t{ ts, record, data_v }); + }); + } + } + return; + } +} + +auto& +get_kernel_dispatch_timestamps() +{ + static auto _v = 
std::unordered_map{}; + return _v; +} + +void +tool_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, void* /*callback_data*/) +{ + auto ts = rocprofiler_timestamp_t{}; + ROCPROFILER_CALL(rocprofiler_get_timestamp(&ts)); + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + user_data->value = ts; + + switch(record.kind) + { + case ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API: + case ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API: + case ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API: + case ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API: + { + tool_tracing_callback_start(category::rocm_hsa_api{}, record, user_data, + ts); + break; + } + case ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API: + case ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API: + { + tool_tracing_callback_start(category::rocm_hip_api{}, record, user_data, + ts); + break; + } + case ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API: + { + tool_tracing_callback_start(category::rocm_marker_api{}, record, + user_data, ts); + break; + } + case ROCPROFILER_CALLBACK_TRACING_NONE: + case ROCPROFILER_CALLBACK_TRACING_LAST: + case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API: + case ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API: + case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT: + case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: + case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + case ROCPROFILER_CALLBACK_TRACING_RCCL_API: + { + ROCPROFSYS_CI_ABORT(true, "unhandled callback record kind: %i\n", + record.kind); + break; + } + } + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + { + using backtrace_entry_vec_t = std::vector; + + constexpr size_t bt_stack_depth = 16; + constexpr size_t bt_ignore_depth = 3; + constexpr bool bt_with_signal_frame = true; + + auto _bt_data = std::optional{}; + if(config::get_use_perfetto() && config::get_perfetto_annotations() && + 
tool_data->backtrace_operations.at(record.kind).count(record.operation) > 0) + { + auto _backtrace = tim::get_unw_stack(); + _bt_data = backtrace_entry_vec_t{}; + _bt_data->reserve(_backtrace.size()); + for(auto itr : _backtrace) + { + if(itr) + { + if(auto _val = binary::lookup_ipaddr_entry(itr->address()); + _val) + { + _bt_data->emplace_back(std::move(*_val)); + } + } + } + } + + switch(record.kind) + { + case ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API: + case ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API: + case ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API: + case ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API: + { + tool_tracing_callback_stop(category::rocm_hsa_api{}, record, user_data, + ts, _bt_data); + break; + } + case ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API: + case ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API: + { + tool_tracing_callback_stop(category::rocm_hip_api{}, record, user_data, + ts, _bt_data); + break; + } + case ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API: + { + tool_tracing_callback_stop(category::rocm_marker_api{}, record, user_data, + ts, _bt_data); + break; + } + case ROCPROFILER_CALLBACK_TRACING_NONE: + case ROCPROFILER_CALLBACK_TRACING_LAST: + case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API: + case ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API: + case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT: + case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: + case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + case ROCPROFILER_CALLBACK_TRACING_RCCL_API: + { + ROCPROFSYS_CI_ABORT(true, "unhandled callback record kind: %i\n", + record.kind); + break; + } + } + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_NONE) + { + if(record.kind == ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH && + record.operation == ROCPROFILER_KERNEL_DISPATCH_COMPLETE) + { + auto* _data = + static_cast( + record.payload); + + // save for post-processing + get_kernel_dispatch_timestamps().emplace( + 
_data->dispatch_info.dispatch_id, + timing_interval{ _data->start_timestamp, _data->end_timestamp }); + } + } + else + { + ROCPROFSYS_CI_ABORT(true, "unhandled callback record phase: %i\n", record.phase); + } +} + +using kernel_dispatch_bundle_t = tim::lightweight_tuple; + +void +tool_tracing_buffered(rocprofiler_context_id_t /*context*/, + rocprofiler_buffer_id_t /*buffer_id*/, + rocprofiler_record_header_t** headers, size_t num_headers, + void* /*user_data*/, uint64_t /*drop_count*/) +{ + if(num_headers == 0 || headers == nullptr) return; + + for(size_t i = 0; i < num_headers; ++i) + { + auto* header = headers[i]; + + if(ROCPROFSYS_LIKELY(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING)) + { + if(header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) + { + auto* record = + static_cast( + header->payload); + + const auto* _kern_sym_data = + get_kernel_symbol_info(record->dispatch_info.kernel_id); + + auto _name = tim::demangle(_kern_sym_data->kernel_name); + auto _corr_id = record->correlation_id.internal; + auto _beg_ns = record->start_timestamp; + auto _end_ns = record->end_timestamp; + auto _agent_id = record->dispatch_info.agent_id; + auto _queue_id = record->dispatch_info.queue_id; + const auto* _agent = tool_data->get_gpu_tool_agent(_agent_id); + + if(get_use_timemory()) + { + const auto& _tinfo = thread_info::get(record->thread_id, SystemTID); + auto _tid = _tinfo->index_data->sequent_value; + + auto _bundle = kernel_dispatch_bundle_t{ _name }; + + _bundle.push(_tid).start().stop(); + _bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) { + _wc->set_value(_end_ns - _beg_ns); + _wc->set_accum(_end_ns - _beg_ns); + }); + _bundle.pop(); + } + + if(get_use_perfetto()) + { + auto _track_desc = [](int32_t _device_id_v, int64_t _queue_id_v) { + return JOIN("", "GPU Kernel Dispatch [", _device_id_v, "] Queue ", + _queue_id_v); + }; + + const auto _track = tracing::get_perfetto_track( + category::rocm_kernel_dispatch{}, _track_desc, 
_agent->device_id, + _queue_id.handle); + + tracing::push_perfetto( + category::rocm_kernel_dispatch{}, _name.c_str(), _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", + _beg_ns); + tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); + tracing::add_perfetto_annotation(ctx, "corr_id", + _corr_id); + tracing::add_perfetto_annotation( + ctx, "node_id", _agent->agent->logical_node_id); + tracing::add_perfetto_annotation(ctx, "queue", + _queue_id.handle); + tracing::add_perfetto_annotation( + ctx, "dispatch_id", + record->dispatch_info.dispatch_id); + tracing::add_perfetto_annotation( + ctx, "kernel_id", record->dispatch_info.kernel_id); + tracing::add_perfetto_annotation( + ctx, "private_segment_size", + record->dispatch_info.private_segment_size); + tracing::add_perfetto_annotation( + ctx, "group_segment_size", + record->dispatch_info.group_segment_size); + tracing::add_perfetto_annotation( + ctx, "workgroup_size", + JOIN("", "(", + JOIN(',', record->dispatch_info.workgroup_size.x, + record->dispatch_info.workgroup_size.y, + record->dispatch_info.workgroup_size.z), + ")")); + tracing::add_perfetto_annotation( + ctx, "grid_size", + JOIN("", "(", + JOIN(',', record->dispatch_info.grid_size.x, + record->dispatch_info.grid_size.y, + record->dispatch_info.grid_size.z), + ")")); + } + }); + tracing::pop_perfetto(category::rocm_kernel_dispatch{}, _name.c_str(), + _track, _end_ns); + } + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) + { + auto* record = + static_cast( + header->payload); + + auto _corr_id = record->correlation_id.internal; + auto _beg_ns = record->start_timestamp; + auto _end_ns = record->end_timestamp; + auto _dst_agent_id = record->dst_agent_id; + auto _src_agent_id = record->src_agent_id; + const auto* _dst_agent = tool_data->get_agent(_dst_agent_id); + const auto* _src_agent = 
tool_data->get_agent(_src_agent_id); + auto _name = + tool_data->buffered_tracing_info.at(record->kind, record->operation); + + if(get_use_timemory()) + { + const auto& _tinfo = thread_info::get(record->thread_id, SystemTID); + auto _tid = _tinfo->index_data->sequent_value; + + auto _bundle = kernel_dispatch_bundle_t{ _name }; + + _bundle.push(_tid).start().stop(); + _bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) { + _wc->set_value(_end_ns - _beg_ns); + _wc->set_accum(_end_ns - _beg_ns); + }); + _bundle.pop(); + } + + if(get_use_perfetto()) + { + auto _track_desc = [](int32_t _device_id_v, + rocprofiler_thread_id_t _tid) { + const auto& _tid_v = thread_info::get(_tid, SystemTID); + return JOIN("", "GPU Memory Copy to Agent [", _device_id_v, + "] Thread ", _tid_v->index_data->sequent_value); + }; + + const auto _track = tracing::get_perfetto_track( + category::rocm_memory_copy{}, _track_desc, + _dst_agent->logical_node_id, record->thread_id); + + tracing::push_perfetto( + category::rocm_memory_copy{}, _name.data(), _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", + _beg_ns); + tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); + tracing::add_perfetto_annotation(ctx, "corr_id", + _corr_id); + tracing::add_perfetto_annotation( + ctx, "dst_agent", _dst_agent->logical_node_id); + tracing::add_perfetto_annotation( + ctx, "src_agent", _src_agent->logical_node_id); + } + }); + tracing::pop_perfetto(category::rocm_memory_copy{}, "", _track, + _end_ns); + } + } + else + { + ROCPROFSYS_THROW( + "unexpected rocprofiler_record_header_t buffer tracing category " + "kind. category: %i, kind: %i\n", + header->category, header->kind); + } + } + else + { + ROCPROFSYS_THROW("unexpected rocprofiler_record_header_t tracing category " + "kind. 
category: %i, kind: %i\n", + header->category, header->kind); + } + } +} + +auto& +get_counter_dispatch_data() +{ + static auto _v = + container::stable_vector{}; + return _v; +} + +auto& +get_counter_dispatch_records() +{ + static auto _v = std::vector{}; + return _v; +} + +using counter_storage_map_t = + std::unordered_map; +using agent_counter_storage_map_t = + std::unordered_map; + +auto*& +get_counter_storage() +{ + static auto* _v = new agent_counter_storage_map_t{}; + return _v; +} + +void +counter_record_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data, + rocprofiler_record_counter_t* record_data, size_t record_count, + rocprofiler_user_data_t /*user_data*/, + void* /*callback_data_arg*/) +{ + auto* _agent_counter_storage = get_counter_storage(); + if(!_agent_counter_storage) return; + + static auto _mtx = std::mutex{}; + auto _lk = std::unique_lock{ _mtx }; + + auto _dispatch_id = dispatch_data.dispatch_info.dispatch_id; + auto _agent_id = dispatch_data.dispatch_info.agent_id; + auto _scope = scope::get_default(); + auto _interval = timing_interval{}; + auto _aggregate = + std::unordered_map{}; + for(size_t i = 0; i < record_count; ++i) + { + auto _counter_id = rocprofiler_counter_id_t{}; + ROCPROFILER_CALL( + rocprofiler_query_record_counter_id(record_data[i].id, &_counter_id)); + + if(!_aggregate.emplace(_counter_id, record_data[i]).second) + { + _aggregate[_counter_id].counter_value += record_data[i].counter_value; + } + } + + if(_agent_counter_storage->count(_agent_id) == 0) + _agent_counter_storage->emplace(_agent_id, counter_storage_map_t{}); + + if(get_kernel_dispatch_timestamps().count(_dispatch_id) > 0) + { + _interval = get_kernel_dispatch_timestamps().at(_dispatch_id); + get_kernel_dispatch_timestamps().erase(_dispatch_id); + } + + for(const auto& itr : _aggregate) + { + if(_agent_counter_storage->at(_agent_id).count(itr.first) == 0) + { + const auto* _agent = tool_data->get_gpu_tool_agent(_agent_id); + const auto* _info = 
tool_data->get_tool_counter_info(_agent_id, itr.first); + + ROCPROFSYS_CONDITIONAL_ABORT_F( + !_agent, "unable to find tool agent for agent (id=%zu)\n", + _agent_id.handle); + ROCPROFSYS_CONDITIONAL_ABORT_F( + !_info, + "unable to find counter info for counter (id=%zu) on agent (id=%zu)\n", + itr.first.handle, _agent_id.handle); + + auto _dev_id = static_cast(_agent->device_id); + + _agent_counter_storage->at(_agent_id).emplace( + itr.first, counter_storage{ tool_data, _dev_id, 0, _info->name }); + } + + auto _event = counter_event{ counter_dispatch_record{ + &dispatch_data, _dispatch_id, itr.first, itr.second } }; + + _agent_counter_storage->at(_agent_id).at(itr.first)(_event, _interval, _scope); + } +} + +void +dispatch_counting_service_callback( + rocprofiler_dispatch_counting_service_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, rocprofiler_user_data_t* /*user_data*/, + void* callback_data_arg) +{ + auto* _data = as_client_data(callback_data_arg); + if(!_data || !config) return; + + if(auto itr = + _data->agent_counter_profiles.find(dispatch_data.dispatch_info.agent_id); + itr != _data->agent_counter_profiles.end() && itr->second) + { + *config = *itr->second; + } +} + +// int +// external_correlation_id_callback( +// rocprofiler_thread_id_t /*thr_id*/, rocprofiler_context_id_t /*ctx_id*/, +// rocprofiler_external_correlation_id_request_kind_t /*kind*/, +// rocprofiler_tracing_operation_t /*op*/, uint64_t /*internal_corr_id*/, +// rocprofiler_user_data_t* external_corr_id, void* /*user_data*/) +// { +// auto* _data = new kernel_dispatch_bundle_t{ "kernel_dispatch" }; +// _data->push(); +// external_corr_id->ptr = _data; +// return 0; +// } + +// void +// agent_counter_profile_callback(rocprofiler_context_id_t context_id, +// rocprofiler_agent_id_t agent, +// rocprofiler_agent_set_profile_callback_t set_config, void*) +// { +// if(!agent_counter_profiles) return; +// if(auto itr = agent_counter_profiles->find(agent); +// itr != 
agent_counter_profiles->end() && itr->second) +// set_config(context_id, *itr->second); +// } + +bool +is_initialized(rocprofiler_context_id_t ctx) +{ + return (ctx.handle > 0); +} + +bool +is_active(rocprofiler_context_id_t ctx) +{ + int status = 0; + auto errc = rocprofiler_context_is_active(ctx, &status); + return (errc == ROCPROFILER_STATUS_SUCCESS && status > 0); +} + +bool +is_valid(rocprofiler_context_id_t ctx) +{ + int status = 0; + auto errc = rocprofiler_context_is_valid(ctx, &status); + return (errc == ROCPROFILER_STATUS_SUCCESS && status > 0); +} + +void +flush() +{ + if(!tool_data) return; + + for(auto itr : tool_data->get_buffers()) + { + if(itr.handle > 0) + { + auto status = rocprofiler_flush_buffer(itr); + if(status != ROCPROFILER_STATUS_ERROR_BUFFER_BUSY) + { + ROCPROFILER_CALL(status); + } + } + } +} + +int +tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) +{ + auto domains = settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS"); + + ROCPROFSYS_VERBOSE_F(1, "rocprof-sys ROCm Domains:\n"); + for(const auto& itr : domains->get_choices()) + ROCPROFSYS_VERBOSE_F(1, "- %s\n", itr.c_str()); + + auto _callback_domains = rocprofiler_sdk::get_callback_domains(); + auto _buffered_domain = rocprofiler_sdk::get_buffered_domains(); + auto _counter_events = rocprofiler_sdk::get_rocm_events(); + + auto* _data = as_client_data(user_data); + _data->client_fini = fini_func; + + _data->initialize(); + if(!_counter_events.empty()) _data->initialize_event_info(); + + ROCPROFILER_CALL(rocprofiler_create_context(&_data->primary_ctx)); + + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + _data->primary_ctx, ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, nullptr, 0, + tool_code_object_callback, _data)); + + for(auto itr : { ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API, + ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, + 
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API }) + { + if(_callback_domains.count(itr) > 0) + { + auto _ops = rocprofiler_sdk::get_operations(itr); + _data->backtrace_operations.emplace( + itr, rocprofiler_sdk::get_backtrace_operations(itr)); + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + _data->primary_ctx, itr, _ops.data(), _ops.size(), tool_tracing_callback, + _data)); + } + } + + constexpr auto buffer_size = 8192; + constexpr auto watermark = 7936; + + if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) > 0) + { + ROCPROFILER_CALL(rocprofiler_create_buffer( + _data->primary_ctx, buffer_size, watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data, + &_data->kernel_dispatch_buffer)); + + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, nullptr, 0, + _data->kernel_dispatch_buffer)); + + // auto external_corr_id_request_kinds = + // std::array{ + // ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH + // }; + + // ROCPROFILER_CALL(rocprofiler_configure_external_correlation_id_request_service( + // _data->primary_ctx, external_corr_id_request_kinds.data(), + // external_corr_id_request_kinds.size(), external_correlation_id_callback, + // _data)); + } + + if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) > 0) + { + ROCPROFILER_CALL(rocprofiler_create_buffer( + _data->primary_ctx, buffer_size, watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data, + &_data->memory_copy_buffer)); + + auto _ops = + rocprofiler_sdk::get_operations(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY); + + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + (_ops.empty()) ? 
nullptr : _ops.data(), _ops.size(), + _data->memory_copy_buffer)); + } + + if(!_counter_events.empty()) + { + for(const auto& itr : _data->gpu_agents) + { + _data->agent_events.emplace( + itr.agent->id, + create_agent_profile(itr.agent->id, _counter_events, _data)); + } + + ROCPROFILER_CALL(rocprofiler_create_context(&_data->counter_ctx)); + + auto _operations = std::array{ + ROCPROFILER_KERNEL_DISPATCH_COMPLETE + }; + + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + _data->counter_ctx, ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, + _operations.data(), _operations.size(), tool_tracing_callback, _data)); + + ROCPROFILER_CALL(rocprofiler_configure_callback_dispatch_counting_service( + _data->counter_ctx, dispatch_counting_service_callback, _data, + counter_record_callback, _data)); + + // ROCPROFILER_CALL(rocprofiler_create_buffer( + // counter_ctx, buffer_size, watermark, + // ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data, + // &counter_collection_buffer)); + + // for(const auto& itr : *agent_counter_profiles) + // { + // ROCPROFILER_CALL(rocprofiler_configure_agent_profile_counting_service( + // counter_ctx, counter_collection_buffer, itr.first, + // agent_counter_profile_callback, nullptr)); + // } + } + + for(const auto& itr : _data->get_buffers()) + { + if(itr.handle > 0) + { + auto client_thread = rocprofiler_callback_thread_t{}; + ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread)); + ROCPROFILER_CALL(rocprofiler_assign_callback_thread(itr, client_thread)); + } + } + + // throwaway context for handling the profiler control API. 
If primary_ctx were used, + // we would get profiler pause callback but never get profiler resume callback + { + auto _local_ctx = rocprofiler_context_id_t{ 0 }; + ROCPROFILER_CALL(rocprofiler_create_context(&_local_ctx)); + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + _local_ctx, ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, nullptr, 0, + tool_control_callback, _data)); + } + + if(!is_valid(_data->primary_ctx)) + { + // notify rocprofiler that initialization failed and all the contexts, buffers, + // etc. created should be ignored + return -1; + } + + gpu::add_device_metadata(); + + if(config::get_use_process_sampling() && config::get_use_rocm_smi()) + { + ROCPROFSYS_VERBOSE_F(1, "Setting rocm_smi state to active...\n"); + rocm_smi::set_state(State::Active); + } + + start(); + + // no errors + return 0; +} + +void +tool_fini(void* callback_data) +{ + static std::atomic_flag _once = ATOMIC_FLAG_INIT; + if(_once.test_and_set()) return; + + flush(); + stop(); + + if(config::get_use_process_sampling() && config::get_use_rocm_smi()) + rocm_smi::shutdown(); + + if(get_counter_storage()) + { + auto _storages = std::vector{}; + for(const auto& citr : *get_counter_storage()) + { + for(const auto& itr : citr.second) + _storages.emplace_back(&itr.second); + } + + std::sort(_storages.begin(), _storages.end(), + [](const counter_storage* lhs, const counter_storage* rhs) { + return *lhs < *rhs; + }); + + for(const auto* itr : _storages) + itr->write(); + _storages.clear(); + + get_counter_storage()->clear(); + delete get_counter_storage(); + get_counter_storage() = nullptr; + } + + auto* _data = as_client_data(callback_data); + _data->client_id = nullptr; + _data->client_fini = nullptr; + + delete tool_data; + tool_data = nullptr; +} +} // namespace + +void +setup() +{ + if(int status = 0; + rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0) + { + ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure)); + } 
+} + +void +shutdown() +{ + // shutdown + if(tool_data && tool_data->client_id && tool_data->client_fini) + tool_data->client_fini(*tool_data->client_id); +} + +void +config() +{} + +void +post_process() +{} + +void +sample() +{} + +void +start() +{ + if(!tool_data) return; + + for(auto itr : tool_data->get_contexts()) + { + if(is_initialized(itr) && !is_active(itr)) + { + ROCPROFILER_CALL(rocprofiler_start_context(itr)); + } + } +} + +void +stop() +{ + if(!tool_data) return; + + for(auto itr : tool_data->get_contexts()) + { + if(is_initialized(itr) && is_active(itr)) + { + ROCPROFILER_CALL(rocprofiler_stop_context(itr)); + } + } +} + +std::vector +get_rocm_events_info() +{ + if(!tool_data) + { + auto _tool_data_v = client_data{}; + _tool_data_v.initialize_event_info(); + return _tool_data_v.events_info; + } + + if(tool_data->events_info.empty()) tool_data->initialize_event_info(); + + return tool_data->events_info; +} +} // namespace rocprofiler_sdk +} // namespace rocprofsys + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, const char* runtime_version, uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate once + { + static bool _first = true; + if(!_first) return nullptr; + _first = false; + } + + if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true)) return nullptr; + if(!tim::settings::enabled()) return nullptr; + + if(!rocprofsys::config::settings_are_configured() && + rocprofsys::get_state() < rocprofsys::State::Active) + rocprofsys_init_tooling_hidden(); + + // set the client name + id->name = "rocprofsys"; + + // ensure tool data exists + if(!rocprofsys::rocprofiler_sdk::tool_data) + rocprofsys::rocprofiler_sdk::tool_data = + new rocprofsys::rocprofiler_sdk::client_data{}; + + // store client info + rocprofsys::rocprofiler_sdk::tool_data->client_id = id; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = 
version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." + << patch << " (" << runtime_version << ")"; + + ROCPROFSYS_VERBOSE_F(0, "%s\n", info.str().c_str()); + ROCPROFSYS_VERBOSE_F(2, "client_id=%u, priority=%u\n", id->handle, priority); + + ROCPROFILER_CALL(rocprofiler_at_internal_thread_create( + rocprofsys::rocprofiler_sdk::thread_precreate, + rocprofsys::rocprofiler_sdk::thread_postcreate, + ROCPROFILER_LIBRARY | ROCPROFILER_HSA_LIBRARY | ROCPROFILER_HIP_LIBRARY | + ROCPROFILER_MARKER_LIBRARY, + nullptr)); + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{ sizeof(rocprofiler_tool_configure_result_t), + &::rocprofsys::rocprofiler_sdk::tool_init, + &::rocprofsys::rocprofiler_sdk::tool_fini, + rocprofsys::rocprofiler_sdk::tool_data }; + + // return pointer to configure data + return &cfg; +} diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.hpp similarity index 54% rename from projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler.hpp rename to projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.hpp index a87784c8b0..4853e4d420 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,67 +22,39 @@ #pragma once -#include "core/defines.hpp" #include "core/timemory.hpp" -#include "library/components/rocprofiler.hpp" -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include namespace rocprofsys { -namespace rocprofiler +namespace rocprofiler_sdk { -std::map> -get_data_labels(); +using hardware_counter_info = ::tim::hardware_counters::info; void -rocm_initialize(); +setup(); void -rocm_cleanup(); +shutdown(); -bool& -is_setup(); +void +config(); void post_process(); -std::vector -rocm_metrics(); +void +sample(); -#if !defined(ROCPROFSYS_USE_ROCPROFILER) || ROCPROFSYS_USE_ROCPROFILER == 0 -inline void -post_process() -{} +void +start(); -inline void -rocm_cleanup() -{} +void +stop(); -inline std::vector -rocm_metrics() -{ - return std::vector{}; -} -#endif - -} // namespace rocprofiler +std::vector +get_rocm_events_info(); +} // namespace rocprofiler_sdk } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt new file mode 100644 index 0000000000..97446e34c4 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt @@ -0,0 +1,9 @@ +# +set(rocprofiler_sdk_sources ${CMAKE_CURRENT_LIST_DIR}/counters.cpp + ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp) + +set(rocprofiler_sdk_headers ${CMAKE_CURRENT_LIST_DIR}/counters.hpp + ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp) + +target_sources(rocprofiler-systems-object-library PRIVATE ${rocprofiler_sdk_sources} + ${rocprofiler_sdk_headers}) diff --git 
a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.cpp new file mode 100644 index 0000000000..fad295f406 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.cpp @@ -0,0 +1,135 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/rocprofiler-sdk/counters.hpp" +#include "common/synchronized.hpp" +#include "core/debug.hpp" +#include "core/timemory.hpp" +#include "library/rocprofiler-sdk/fwd.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +namespace +{ +std::string +get_counter_description(const client_data* tool_data, std::string_view _v) +{ + const auto& _info = tool_data->events_info; + for(const auto& itr : _info) + { + if(itr.symbol().find(_v) == 0 || itr.short_description().find(_v) == 0) + { + return itr.long_description(); + } + } + return std::string{}; +} +} // namespace + +void +counter_event::operator()(const client_data* tool_data, ::perfetto::CounterTrack* _track, + timing_interval _timing, scope::config _scope) const +{ + if(!record.dispatch_data) return; + + const auto& _dispatch_info = record.dispatch_data->dispatch_info; + const auto* _kern_sym_data = + tool_data->get_kernel_symbol_info(_dispatch_info.kernel_id); + + auto _bundle = counter_bundle_t{ tim::demangle(_kern_sym_data->kernel_name), _scope }; + + _bundle.push(_dispatch_info.queue_id.handle) + .start() + .store(record.record_counter.counter_value); + + _bundle.stop().pop(_dispatch_info.queue_id.handle); + + if(_track && _timing.start > 0 && _timing.end > _timing.start) + { + TRACE_COUNTER(trait::name::value, *_track, + _timing.start, record.record_counter.counter_value); + TRACE_COUNTER(trait::name::value, *_track, + _timing.end, 0); + } +} + +counter_storage::counter_storage(const client_data* _tool_data, uint64_t _devid, + size_t _idx, std::string_view _name) +: tool_data{ _tool_data } +, device_id{ _devid } +, index{ static_cast(_idx) } +, metric_name{ _name } +, metric_description{ get_counter_description(_tool_data, metric_name) } +{ + auto _metric_name = std::string{ _name }; + _metric_name = + std::regex_replace(_metric_name, std::regex{ 
"(.*)\\[([0-9]+)\\]" }, "$1_$2"); + storage_name = JOIN('-', "rocprof", "device", device_id, _metric_name); + storage = std::make_unique(tim::standalone_storage{}, index, + storage_name); + { + constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_COUNT; + track_name = JOIN(" ", "GPU", _metric_name, JOIN("", '[', device_id, ']')); + track = std::make_unique( + ::perfetto::StaticString(track_name.c_str())); + track->set_is_incremental(false); + track->set_unit(_unit); + track->set_unit_multiplier(1); + } +} + +void +counter_storage::operator()(const counter_event& _event, timing_interval _timing, + scope::config _scope) const +{ + operation::set_storage{}(storage.get()); + _event(tool_data, track.get(), _timing, _scope); +} + +void +counter_storage::write() const +{ + operation::set_storage{}(storage.get()); + counter_data_tracker::label() = metric_name; + counter_data_tracker::description() = metric_description; + storage->write(); +} +} // namespace rocprofiler_sdk +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.hpp new file mode 100644 index 0000000000..bcd1ddcdde --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/counters.hpp @@ -0,0 +1,168 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "common/synchronized.hpp" +#include "core/debug.hpp" +#include "core/perfetto.hpp" +#include "core/timemory.hpp" +#include "library/rocprofiler-sdk/fwd.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +struct counter_dispatch_record +{ + const rocprofiler_dispatch_counting_service_data_t* dispatch_data = nullptr; + rocprofiler_dispatch_id_t dispatch_id = 0; + rocprofiler_counter_id_t counter_id = {}; + rocprofiler_record_counter_t record_counter = {}; +}; + +struct counter_data_tag +{}; + +using counter_data_tracker = component::data_tracker; +using counter_storage_type = typename counter_data_tracker::storage_type; +using counter_bundle_t = tim::lightweight_tuple; +using counter_track_type = ::perfetto::CounterTrack; + +struct counter_event +{ + ROCPROFSYS_DEFAULT_OBJECT(counter_event) + + explicit counter_event(counter_dispatch_record&& _v) + : record{ _v } + {} + + void operator()(const client_data* tool_data, counter_track_type*, + timing_interval _timing, scope::config _scope) const; + + counter_dispatch_record record = {}; +}; + +struct counter_storage +{ + const client_data* tool_data = nullptr; + uint64_t device_id = 0; + int64_t index = 0; + std::string metric_name = {}; + std::string metric_description = {}; + std::string storage_name = {}; + std::string track_name = {}; + std::unique_ptr storage = {}; + std::unique_ptr track = {}; + + counter_storage(const client_data* _tool_data, uint64_t _devid, size_t _idx, + std::string_view _name); + + ~counter_storage() = default; + counter_storage(const counter_storage&) = delete; + counter_storage(counter_storage&&) = default; + counter_storage& operator=(const counter_storage&) = delete; + counter_storage& operator=(counter_storage&&) = default; + + friend bool operator<(const counter_storage& lhs, const counter_storage& rhs) + { + return 
std::tie(lhs.storage_name, lhs.device_id, lhs.index) < + std::tie(rhs.storage_name, rhs.device_id, rhs.index); + } + + void operator()(const counter_event& _event, timing_interval _timing, + scope::config _scope = scope::get_default()) const; + + void write() const; +}; +} // namespace rocprofiler_sdk +} // namespace rocprofsys + +namespace tim +{ +namespace operation +{ +template <> +struct set_storage<::rocprofsys::rocprofiler_sdk::counter_data_tracker> +{ + static constexpr size_t max_threads = 4096; + using type = ::rocprofsys::rocprofiler_sdk::counter_data_tracker; + using storage_array_t = std::array*, max_threads>; + friend struct get_storage; + + ROCPROFSYS_DEFAULT_OBJECT(set_storage) + + auto operator()(storage* _v, size_t _idx) const { get().at(_idx) = _v; } + auto operator()(type&, size_t) const {} + auto operator()(storage* _v) const { get().fill(_v); } + +private: + static storage_array_t& get() + { + static storage_array_t _v = { nullptr }; + return _v; + } +}; + +template <> +struct get_storage<::rocprofsys::rocprofiler_sdk::counter_data_tracker> +{ + using type = ::rocprofsys::rocprofiler_sdk::counter_data_tracker; + + ROCPROFSYS_DEFAULT_OBJECT(get_storage) + + auto operator()(const type&) const + { + return operation::set_storage::get().at(0); + } + + auto operator()() const + { + type _obj{}; + return (*this)(_obj); + } + + auto operator()(size_t _idx) const + { + return operation::set_storage::get().at(_idx); + } + + auto operator()(type&, size_t _idx) const { return (*this)(_idx); } +}; +} // namespace operation +} // namespace tim diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp new file mode 100644 index 0000000000..4120c27b5a --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp @@ -0,0 +1,270 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, 
Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/rocprofiler-sdk/fwd.hpp" +#include "core/debug.hpp" +#include "core/state.hpp" + +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +namespace +{ +using tool_agent_vec_t = std::vector; + +rocprofiler_status_t +dimensions_info_callback(rocprofiler_counter_id_t /*id*/, + const rocprofiler_record_dimension_info_t* dim_info, + long unsigned int num_dims, void* user_data) +{ + auto* dimensions_info = + static_cast*>(user_data); + dimensions_info->reserve(num_dims); + for(size_t j = 0; j < num_dims; j++) + dimensions_info->emplace_back(dim_info[j]); + + return ROCPROFILER_STATUS_SUCCESS; +} + +rocprofiler_status_t +counters_supported_callback(rocprofiler_agent_id_t agent_id, + rocprofiler_counter_id_t* counters, size_t num_counters, + void* user_data) +{ + using value_type = typename agent_counter_info_map_t::mapped_type; + + auto* data_v = static_cast(user_data); + data_v->emplace(agent_id, value_type{}); + for(size_t i = 0; i < num_counters; ++i) + { + auto _info = rocprofiler_counter_info_v0_t{}; + auto _dim_info = std::vector{}; + + ROCPROFILER_CALL(rocprofiler_query_counter_info( + counters[i], ROCPROFILER_COUNTER_INFO_VERSION_0, &_info)); + + // populate local vector + ROCPROFILER_CALL(rocprofiler_iterate_counter_dimensions( + counters[i], dimensions_info_callback, &_dim_info)); + + if(!_info.is_constant) + data_v->at(agent_id).emplace_back(agent_id, _info, std::move(_dim_info)); + } + return ROCPROFILER_STATUS_SUCCESS; +} + +/* Collects the supported counters of every agent in _agents, keyed by agent id. */ +/* Each agent's counters are sorted by id.handle and each counter's dimensions by id. */ +/* An agent for which no counters were reported still gets an (empty) entry. */ +agent_counter_info_map_t +get_agent_counter_info(const tool_agent_vec_t& _agents) +{ + auto _data = agent_counter_info_map_t{}; + + for(const auto& itr : _agents) + { + ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters( + itr.agent->id, counters_supported_callback, &_data)); + + /* operator[] instead of at(): the entry is only inserted by the callback and */ + /* ROCPROFILER_CALL merely warns on failure, so at() could throw std::out_of_range. */ + auto& _agent_counters = _data[itr.agent->id]; + + std::sort(_agent_counters.begin(), _agent_counters.end(), + [](const auto& lhs, const auto& rhs) { + return (lhs.id.handle < 
rhs.id.handle); + }); + + for(auto& citr : _agent_counters) + { + std::sort(citr.dimension_info.begin(), citr.dimension_info.end(), + [](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); }); + } + } + + return _data; +} +} // namespace + +rocprofiler_tool_counter_info_t::rocprofiler_tool_counter_info_t( + rocprofiler_agent_id_t _agent_id, parent_type _info, dimension_info_vec_t&& _dim_info) +: parent_type{ _info } +, agent_id{ _agent_id } +, dimension_info{ std::move(_dim_info) } +{} + +void +client_data::initialize() +{ + buffered_tracing_info = rocprofiler::sdk::get_buffer_tracing_names(); + callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names(); + + static constexpr auto supported_agent_info_version = ROCPROFILER_AGENT_INFO_VERSION_0; + + rocprofiler_query_available_agents_cb_t iterate_cb = + [](rocprofiler_agent_version_t version, const void** agents_arr, + size_t num_agents, void* user_data) { + ROCPROFSYS_CONDITIONAL_ABORT(version != supported_agent_info_version, + "rocprofiler agent info version != expected " + "agent info version (=%i). 
value: %i\n", + supported_agent_info_version, version); + + auto _agents_v = std::vector{}; + for(size_t i = 0; i < num_agents; ++i) + { + const auto* _agent = + static_cast(agents_arr[i]); + _agents_v.emplace_back(*_agent); + } + + auto* tool_data_v = as_client_data(user_data); + tool_data_v->set_agents(std::move(_agents_v)); + + return ROCPROFILER_STATUS_SUCCESS; + }; + + ROCPROFILER_CALL(rocprofiler_query_available_agents( + supported_agent_info_version, iterate_cb, sizeof(rocprofiler_agent_t), this)); +} + +void +client_data::initialize_event_info() +{ + if(agents.empty()) initialize(); + + if(agent_counter_info.size() != gpu_agents.size()) + agent_counter_info = get_agent_counter_info(gpu_agents); + + try + { + using qualifier_t = tim::hardware_counters::qualifier; + using qualifier_vec_t = std::vector; + + for(const auto& aitr : gpu_agents) + { + auto _dev_index = aitr.device_id; + auto _device_qualifier_sym = JOIN("", ":device=", _dev_index); + auto _device_qualifier = + tim::hardware_counters::qualifier{ true, static_cast(_dev_index), + _device_qualifier_sym, + JOIN(" ", "Device", _dev_index) }; + + auto _counter_info = agent_counter_info.at(aitr.agent->id); + std::sort(_counter_info.begin(), _counter_info.end(), + [](const rocprofiler_tool_counter_info_t& lhs, + const rocprofiler_tool_counter_info_t& rhs) { + if(lhs.is_constant && rhs.is_constant) + return lhs.id < rhs.id; + else if(lhs.is_constant) + return true; + else if(rhs.is_constant) + return false; + + if(!lhs.is_derived && !rhs.is_derived) + return lhs.id < rhs.id; + else if(!lhs.is_derived) + return true; + else if(!rhs.is_derived) + return false; + + return lhs.id < rhs.id; + }); + + for(const auto& ditr : _counter_info) + { + auto _long_desc = std::string{ ditr.description }; + auto _units = std::string{}; + auto _pysym = std::string{}; + if(ditr.is_constant) + { + continue; + } + else if(ditr.is_derived) + { + auto _sym = JOIN("", ditr.name, _device_qualifier_sym); + auto _short_desc = 
JOIN("", "Derived counter: ", ditr.expression); + events_info.emplace_back(hardware_counter_info( + true, tim::hardware_counters::api::rocm, events_info.size(), 0, + _sym, _pysym, _short_desc, _long_desc, _units, + qualifier_vec_t{ _device_qualifier })); + } + else + { + auto _dim_info = std::vector{}; + + for(const auto& itr : ditr.dimension_info) + { + auto _info = (itr.instance_size > 1) + ? JOIN("", itr.name, "[", 0, ":", + itr.instance_size - 1, "]") + : std::string{}; + if(!_info.empty()) _dim_info.emplace_back(_info); + } + + auto _sym = JOIN("", ditr.name, _device_qualifier_sym); + auto _short_desc = JOIN("", ditr.name, " on device ", _dev_index); + if(!_dim_info.empty()) + { + namespace join = ::timemory::join; + _short_desc += JOIN( + "", ". ", + join::join(join::array_config{ ", ", "", "" }, _dim_info)); + } + events_info.emplace_back(hardware_counter_info( + true, tim::hardware_counters::api::rocm, events_info.size(), 0, + _sym, _pysym, _short_desc, _long_desc, _units, + qualifier_vec_t{ _device_qualifier })); + } + } + } + } catch(std::exception& _e) + { + ROCPROFSYS_WARNING_F(1, "Constructing ROCm event info failed: %s\n", _e.what()); + } +} + +void +client_data::set_agents(agent_vec_t&& _agents_v) +{ + agents = std::move(_agents_v); + + std::sort(agents.begin(), agents.end(), + [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; }); + + cpu_agents.clear(); + gpu_agents.clear(); + + for(const auto& itr : agents) + { + if(itr.type == ROCPROFILER_AGENT_TYPE_CPU) + cpu_agents.emplace_back(tool_agent{ cpu_agents.size(), &itr }); + else if(itr.type == ROCPROFILER_AGENT_TYPE_GPU) + gpu_agents.emplace_back(tool_agent{ gpu_agents.size(), &itr }); + } +} +} // namespace rocprofiler_sdk +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp new file mode 100644 index 
0000000000..4a702cb985 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp @@ -0,0 +1,252 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "common/synchronized.hpp" +#include "core/timemory.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +using hardware_counter_info = ::tim::hardware_counters::info; + +using kernel_symbol_data_t = + rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using kernel_symbol_map_t = + std::unordered_map; +using callback_arg_array_t = std::vector>; + +struct code_object_callback_record_t +{ + uint64_t timestamp = 0; + rocprofiler_callback_tracing_record_t record = {}; + rocprofiler_callback_tracing_code_object_load_data_t payload = {}; +}; + +struct kernel_symbol_callback_record_t +{ + uint64_t timestamp = 0; + rocprofiler_callback_tracing_record_t record = {}; + kernel_symbol_data_t payload = {}; +}; + +struct rocprofiler_tool_counter_info_t : rocprofiler_counter_info_v0_t +{ + using this_type = rocprofiler_tool_counter_info_t; + using parent_type = rocprofiler_counter_info_v0_t; + using dimension_info_vec_t = std::vector; + + rocprofiler_tool_counter_info_t(rocprofiler_agent_id_t _agent_id, parent_type _info, + dimension_info_vec_t&& _dim_info); + + rocprofiler_tool_counter_info_t() = default; + ~rocprofiler_tool_counter_info_t() = default; + rocprofiler_tool_counter_info_t(const rocprofiler_tool_counter_info_t&) = default; + rocprofiler_tool_counter_info_t(rocprofiler_tool_counter_info_t&&) noexcept = default; + rocprofiler_tool_counter_info_t& operator=(const rocprofiler_tool_counter_info_t&) = + default; + rocprofiler_tool_counter_info_t& operator =( + rocprofiler_tool_counter_info_t&&) noexcept = default; + + rocprofiler_agent_id_t agent_id = {}; + std::vector dimension_info = {}; +}; + +struct tool_agent +{ + uint64_t device_id = 0; + const rocprofiler_agent_v0_t* agent = nullptr; +}; + +struct timing_interval +{ + rocprofiler_timestamp_t start = 0; + rocprofiler_timestamp_t end = 0; +}; + +using 
agent_counter_info_map_t = + std::unordered_map>; + +using agent_counter_profile_map_t = + std::unordered_map>; + +using counter_id_vec_t = std::vector; + +using agent_counter_id_map_t = + std::unordered_map; + +using backtrace_operation_map_t = + std::unordered_map>; + +struct client_data +{ + static constexpr size_t num_buffers = 3; + static constexpr size_t num_contexts = 2; + + using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t; + using callback_name_info_t = rocprofiler::sdk::callback_name_info_t; + using kernel_symbol_vec_t = std::vector; + using code_object_vec_t = std::vector; + using buffer_id_vec_t = std::array; + using context_id_vec_t = std::array; + using agent_vec_t = std::vector; + + rocprofiler_client_id_t* client_id = nullptr; + rocprofiler_client_finalize_t client_fini = nullptr; + rocprofiler_context_id_t primary_ctx = { 0 }; + rocprofiler_context_id_t counter_ctx = { 0 }; + rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 }; + rocprofiler_buffer_id_t memory_copy_buffer = { 0 }; + rocprofiler_buffer_id_t counter_collection_buffer = { 0 }; + std::vector agents = {}; + std::vector cpu_agents = {}; + std::vector gpu_agents = {}; + std::vector events_info = {}; + agent_counter_id_map_t agent_events = {}; + agent_counter_info_map_t agent_counter_info = {}; + agent_counter_profile_map_t agent_counter_profiles = {}; + common::synchronized code_object_records = {}; + common::synchronized kernel_symbol_records = {}; + buffer_name_info_t buffered_tracing_info = {}; + callback_name_info_t callback_tracing_info = {}; + backtrace_operation_map_t backtrace_operations = {}; + + void initialize(); + void initialize_event_info(); + void set_agents(agent_vec_t&& agents); + context_id_vec_t get_contexts() const; + buffer_id_vec_t get_buffers() const; + const rocprofiler_agent_t* get_agent(rocprofiler_agent_id_t _id) const; + const tool_agent* get_gpu_tool_agent(rocprofiler_agent_id_t id) const; + const kernel_symbol_data_t* 
get_kernel_symbol_info(uint64_t _kernel_id) const; + const rocprofiler_tool_counter_info_t* get_tool_counter_info( + rocprofiler_agent_id_t _agent_id, rocprofiler_counter_id_t _counter_id) const; +}; + +inline client_data::context_id_vec_t +client_data::get_contexts() const +{ + return context_id_vec_t{ + primary_ctx, + counter_ctx, + }; +} + +inline client_data::buffer_id_vec_t +client_data::get_buffers() const +{ + return buffer_id_vec_t{ + kernel_dispatch_buffer, + memory_copy_buffer, + counter_collection_buffer, + }; +} + +inline const rocprofiler_agent_t* +client_data::get_agent(rocprofiler_agent_id_t _id) const +{ + for(const auto& itr : agents) + if(itr.id == _id) return &itr; + return nullptr; +} + +inline const tool_agent* +client_data::get_gpu_tool_agent(rocprofiler_agent_id_t id) const +{ + for(const auto& itr : gpu_agents) + if(id == itr.agent->id) return &itr; + return nullptr; +} + +/* Returns the payload of the recorded kernel symbol whose kernel_id equals */ +/* _kernel_id, or nullptr when no record matches. The search runs under rlock(). */ +inline const kernel_symbol_data_t* +client_data::get_kernel_symbol_info(uint64_t _kernel_id) const +{ + return kernel_symbol_records.rlock( + [_kernel_id](const auto& _data) -> const kernel_symbol_data_t* { + for(const auto& itr : _data) + { + if(_kernel_id == itr->payload.kernel_id) + { + return &itr->payload; + } + } + return nullptr; + }); +} + +/* Returns the counter info for _counter_id on agent _agent_id, or nullptr when */ +/* either the agent or the counter is unknown. */ +inline const rocprofiler_tool_counter_info_t* +client_data::get_tool_counter_info(rocprofiler_agent_id_t _agent_id, + rocprofiler_counter_id_t _counter_id) const +{ + /* find() rather than at(): an agent without an entry means "not found" */ + /* (nullptr, like get_agent/get_gpu_tool_agent), not std::out_of_range. */ + auto _agent_itr = agent_counter_info.find(_agent_id); + if(_agent_itr == agent_counter_info.end()) return nullptr; + + for(const auto& itr : _agent_itr->second) + { + if(itr.id == _counter_id) return &itr; + } + return nullptr; +} + +inline constexpr client_data* +as_client_data(void* _ptr) +{ + return static_cast(_ptr); +} +} // namespace rocprofiler_sdk +} // namespace rocprofsys + +#if !defined(ROCPROFILER_CALL) +# define ROCPROFILER_CALL(result) \ + { \ + rocprofiler_status_t ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) = \ + (result); \ + if(ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) != \ + ROCPROFILER_STATUS_SUCCESS) \ + { \ + auto msg = 
std::stringstream{}; \ + std::string status_msg = rocprofiler_get_status_string( \ + ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__)); \ + msg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " \ + << "rocprofiler-sdk call [" << #result \ + << "] failed with error code " \ + << ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) \ + << " :: " << status_msg; \ + ROCPROFSYS_WARNING(0, "%s\n", msg.str().c_str()); \ + } \ + } +#endif diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler.cpp deleted file mode 100644 index 5e155b874b..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler.cpp +++ /dev/null @@ -1,834 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "library/rocprofiler.hpp" -#include "core/common.hpp" -#include "core/config.hpp" -#include "core/debug.hpp" -#include "core/gpu.hpp" -#include "core/perfetto.hpp" -#include "library/rocm.hpp" -#include "library/rocm/hsa_rsrc_factory.hpp" - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace rocprofsys -{ -namespace rocprofiler -{ -namespace -{ -using ::rocprofiler::util::AgentInfo; -using ::rocprofiler::util::HsaRsrcFactory; - -auto& -get_event_names() -{ - static auto _v = std::map>{}; - return _v; -} -} // namespace - -// Error handler -void -fatal(const std::string& msg) -{ - ROCPROFSYS_PRINT_F("\n"); - ROCPROFSYS_PRINT_F("%s\n", msg.c_str()); - abort(); -} - -// Check returned HSA API status -const char* -rocm_error_string(hsa_status_t _status) -{ - const char* _err_string = nullptr; - if(_status != HSA_STATUS_SUCCESS) rocprofiler_error_string(&_err_string); - return _err_string; -} - -// Check returned HSA API status -bool -rocm_check_status(hsa_status_t _status, const std::set& _nonfatal = {}) -{ - if(_status != HSA_STATUS_SUCCESS) - { - if(_nonfatal.count(_status) == 0) - fatal(JOIN(" :: ", "ERROR", rocm_error_string(_status))); - - ROCPROFSYS_PRINT_F("Warning! 
%s\n", rocm_error_string(_status)); - return false; - } - return true; -} - -// Context stored entry type -struct context_entry_t -{ - bool valid; - hsa_agent_t agent; - rocprofiler_group_t group; - rocprofiler_callback_data_t data; -}; - -// Context callback arg -struct callbacks_arg_t -{ - rocprofiler_pool_t** pools; -}; - -// Handler callback arg -struct handler_arg_t -{ - rocprofiler_feature_t* features; - unsigned feature_count; -}; - -bool& -is_setup() -{ - static bool _v = false; - return _v; -} - -std::map> -get_data_labels() -{ - auto _v = std::map>{}; - for(const auto& itr : get_event_names()) - { - _v[itr.first] = {}; - for(auto vitr : itr.second) - _v[itr.first].emplace_back(std::string_view{ vitr.name }); - } - return _v; -} - -// Dump stored context entry -void -rocm_dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, - unsigned feature_count) -{ - volatile std::atomic* valid = - reinterpret_cast*>(&entry->valid); - while(valid->load() == false) - sched_yield(); - - const rocprofiler_dispatch_record_t* record = entry->data.record; - - if(!record) return; // there is nothing to do here. 
- - auto _queue_id = entry->data.queue_id; - auto _thread_id = entry->data.thread_id; - auto _dev_id = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index; - auto _kernel_name = std::string{ entry->data.kernel_name }; - auto _pos = _kernel_name.find_last_of(')'); - if(_pos != std::string::npos) _kernel_name = _kernel_name.substr(0, _pos + 1); - - rocprofiler_group_t& group = entry->group; - if(group.context == nullptr) - { - fatal("context is nullptr\n"); - } - - if(feature_count > 0) - { - rocm_check_status(rocprofiler_group_get_data(&group)); - rocm_check_status(rocprofiler_get_metrics(group.context)); - } - - auto _evt = - component::rocm_event{ _dev_id, _thread_id, _queue_id, _kernel_name, - record->begin, record->end, feature_count, features }; - - component::rocm_data()->emplace_back(_evt); -} - -// Profiling completion handler -// Dump and delete the context entry -// Return true if the context was dumped successfully -bool -rocm_context_handler(const rocprofiler_pool_entry_t* entry, void* arg) -{ - // Context entry - context_entry_t* ctx_entry = reinterpret_cast(entry->payload); - handler_arg_t* handler_arg = reinterpret_cast(arg); - - // rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock }; - // if(!_lk.owns_lock()) _lk.lock(); - - rocm_dump_context_entry(ctx_entry, handler_arg->features, handler_arg->feature_count); - - return true; -} - -// Kernel disoatch callback -hsa_status_t -rocm_dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg, - rocprofiler_group_t* group) -{ - // Passed tool data - hsa_agent_t agent = callback_data->agent; - - // Open profiling context - const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; - callbacks_arg_t* callbacks_arg = reinterpret_cast(arg); - rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id]; - rocprofiler_pool_entry_t pool_entry{}; - rocm_check_status(rocprofiler_pool_fetch(pool, &pool_entry)); - // Profiling context entry - rocprofiler_t* 
context = pool_entry.context; - context_entry_t* entry = reinterpret_cast(pool_entry.payload); - - // Get group[0] - rocm_check_status(rocprofiler_get_group(context, 0, group)); - - // Fill profiling context entry - entry->agent = agent; - entry->group = *group; - entry->data = *callback_data; - entry->data.kernel_name = strdup(callback_data->kernel_name); - reinterpret_cast*>(&entry->valid)->store(true); - - return HSA_STATUS_SUCCESS; -} - -unsigned -metrics_input(unsigned _device, rocprofiler_feature_t** ret) -{ - // Profiling feature objects - auto _events = tim::delimit(config::get_rocm_events(), ", ;\t\n"); - std::vector _features = {}; - auto _this_device = JOIN("", ":device=", _device); - for(auto itr : _events) - { - ROCPROFSYS_VERBOSE_F(3, "Processing feature '%s' for device %u...\n", itr.c_str(), - _device); - auto _pos = itr.find(":device="); - if(_pos != std::string::npos) - { - if(itr.find(_this_device) != std::string::npos) - { - _features.emplace_back(itr.substr(0, _pos)); - } - } - else - { - _features.emplace_back(itr); - } - } - const unsigned feature_count = _features.size(); - rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; - memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); - - // PMC events - for(unsigned i = 0; i < feature_count; ++i) - { - ROCPROFSYS_VERBOSE_F(3, "Adding feature '%s' for device %u...\n", - _features.at(i).c_str(), _device); - features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[i].name = strdup(_features.at(i).c_str()); - features[i].parameters = nullptr; - features[i].parameter_count = 0; - } - - *ret = features; - return feature_count; -} - -using info_data = std::vector; - -hsa_status_t -info_data_callback(const rocprofiler_info_data_t info, void* arg) -{ - using qualifier_t = tim::hardware_counters::qualifier; - using qualifier_vec_t = std::vector; - auto* _data = static_cast(arg); - auto _dev_index = info.agent_index; - - switch(info.kind) - { - case 
ROCPROFILER_INFO_KIND_METRIC: - { - auto _device_qualifier_sym = JOIN("", ":device=", _dev_index); - auto _device_qualifier = - tim::hardware_counters::qualifier{ true, static_cast(_dev_index), - _device_qualifier_sym, - JOIN(" ", "Device", _dev_index) }; - auto _long_desc = std::string{ info.metric.description }; - auto _units = std::string{}; - auto _pysym = std::string{}; - if(info.metric.expr != nullptr) - { - auto _sym = JOIN("", info.metric.name, _device_qualifier_sym); - auto _short_desc = JOIN("", "Derived counter: ", info.metric.expr); - _data->emplace_back(component::rocm_info_entry( - true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, - _pysym, _short_desc, _long_desc, _units, - qualifier_vec_t{ _device_qualifier })); - } - else - { - if(info.metric.instances == 1) - { - auto _sym = JOIN("", info.metric.name, _device_qualifier_sym); - auto _short_desc = - JOIN("", info.metric.name, " on device ", _dev_index); - _data->emplace_back(component::rocm_info_entry( - true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, - _pysym, _short_desc, _long_desc, _units, - qualifier_vec_t{ _device_qualifier })); - } - else - { - for(uint32_t i = 0; i < info.metric.instances; ++i) - { - auto _instance_qualifier_sym = JOIN("", '[', i, ']'); - auto _instance_qualifier = - tim::hardware_counters::qualifier{ true, static_cast(i), - _instance_qualifier_sym, - JOIN(" ", "Instance", i) }; - auto _sym = JOIN("", info.metric.name, _instance_qualifier_sym, - _device_qualifier_sym); - auto _short_desc = JOIN("", info.metric.name, " instance ", i, - " on device ", _dev_index); - _data->emplace_back(component::rocm_info_entry( - true, tim::hardware_counters::api::rocm, _data->size(), 0, - _sym, _pysym, _short_desc, _long_desc, _units, - qualifier_vec_t{ _device_qualifier, _instance_qualifier })); - } - } - } - break; - } - default: printf("wrong info kind %u\n", info.kind); return HSA_STATUS_ERROR; - } - return HSA_STATUS_SUCCESS; -} - -std::vector 
-rocm_metrics() -{ - std::vector _data = {}; - try - { - (void) HsaRsrcFactory::Instance(); - } catch(std::runtime_error& _e) - { - ROCPROFSYS_VERBOSE_F(0, "%s\n", _e.what()); - return _data; - } - - // Available GPU agents - const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); - - std::vector _gpu_agents(gpu_count, nullptr); - for(unsigned i = 0; i < gpu_count; ++i) - { - const AgentInfo* _agent = _gpu_agents[i]; - const AgentInfo** _agent_p = &_agent; - HsaRsrcFactory::Instance().GetGpuAgentInfo(i, _agent_p); - - if(!rocm_check_status(rocprofiler_iterate_info( - &_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC, - info_data_callback, reinterpret_cast(&_data)), - { HSA_STATUS_ERROR_NOT_INITIALIZED })) - { - ROCPROFSYS_WARNING_F(-1, "rocprofiler_iterate_info failed for gpu agent %u\n", - i); - } - } - - if(gpu_count > 0 && _data.empty()) - { - if(!rocm_check_status(rocprofiler_iterate_info( - nullptr, ROCPROFILER_INFO_KIND_METRIC, - info_data_callback, reinterpret_cast(&_data)), - { HSA_STATUS_ERROR_NOT_INITIALIZED })) - { - ROCPROFSYS_WARNING_F( - -1, "rocprofiler_iterate_info failed for %i gpu agents\n", gpu_count); - } - } - - auto _settings = tim::settings::shared_instance(); - if(_settings) - { - auto ritr = _settings->find("ROCPROFSYS_ROCM_EVENTS"); - if(ritr != _settings->end()) - { - auto _rocm_events = ritr->second; - if(_rocm_events->get_choices().empty()) - { - std::vector _choices = {}; - _choices.reserve(_data.size()); - for(auto itr : _data) - { - if(!itr.symbol().empty()) _choices.emplace_back(itr.symbol()); - } - _rocm_events->set_choices(_choices); - } - } - } - - return _data; -} - -void -rocm_initialize() -{ - // Available GPU agents - const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); - - (void) rocm_metrics(); - - // Adding dispatch observer - callbacks_arg_t* callbacks_arg = new callbacks_arg_t{}; - callbacks_arg->pools = new rocprofiler_pool_t*[gpu_count]; - for(unsigned gpu_id = 0; gpu_id < 
gpu_count; gpu_id++) - { - // Getting profiling features - rocprofiler_feature_t* features = nullptr; - unsigned feature_count = metrics_input(gpu_id, &features); - - if(features) - { - get_event_names()[gpu_id].clear(); - get_event_names()[gpu_id].reserve(feature_count); - for(unsigned i = 0; i < feature_count; ++i) - get_event_names().at(gpu_id).emplace_back(features[i]); - } - - // Handler arg - handler_arg_t* handler_arg = new handler_arg_t{}; - handler_arg->features = features; - handler_arg->feature_count = feature_count; - - // Context properties - rocprofiler_pool_properties_t properties{}; - properties.num_entries = 100; - properties.payload_bytes = sizeof(context_entry_t); - properties.handler = rocm_context_handler; - properties.handler_arg = handler_arg; - - // Getting GPU device info - const AgentInfo* agent_info = nullptr; - if(HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false) - { - fprintf(stderr, "GetGpuAgentInfo failed\n"); - abort(); - } - - // Open profiling pool - rocprofiler_pool_t* pool = nullptr; - uint32_t mode = 0; // ROCPROFILER_MODE_SINGLEGROUP - rocm_check_status(rocprofiler_pool_open(agent_info->dev_id, features, - feature_count, &pool, mode, &properties)); - callbacks_arg->pools[gpu_id] = pool; - } - - rocprofiler_queue_callbacks_t callbacks_ptrs{}; - callbacks_ptrs.dispatch = rocm_dispatch_callback; - int err = rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg); - ROCPROFSYS_VERBOSE_F(3, "err=%d, rocprofiler_set_queue_callbacks\n", err); - - is_setup() = true; -} - -void -rocm_cleanup() -{ - // Unregister dispatch callback - rocm_check_status(rocprofiler_remove_queue_callbacks()); - // close profiling pool - // rocm_check_status(rocprofiler_pool_flush(pool)); - // rocm_check_status(rocprofiler_pool_close(pool)); -} - -namespace -{ -using rocm_event = component::rocm_event; -using rocm_data_t = component::rocm_data_t; -using rocm_metric_type = component::rocm_metric_type; -using rocm_feature_value = 
component::rocm_feature_value; -using rocm_data_tracker = component::rocm_data_tracker; - -void -post_process_perfetto() -{ - using counter_track = perfetto_counter_track; - - static bool _once = false; - if(_once) return; - - auto _data = rocm_data_t{}; - auto _device_data = std::map>{}; - auto _device_fields = std::map>{}; - auto _device_range = std::map>{}; - - for(size_t i = 0; i < ROCPROFSYS_MAX_THREADS; ++i) - { - auto& _v = component::rocm_data(i); - if(_v) - { - _data.reserve(_data.size() + _v->size()); - for(auto& itr : *_v) - _data.emplace_back(itr); - } - } - - if(_data.empty()) return; - _once = true; - - std::sort(_data.begin(), _data.end()); - - auto _get_events = [](std::vector& _inp, rocm_metric_type _ts) { - auto _v = std::vector{}; - for(const auto& itr : _inp) - { - if(_ts >= itr->entry && _ts <= itr->exit) _v.emplace_back(itr); - if(_ts > itr->exit) break; - } - return _v; - }; - - { - auto _device_time = std::map>{}; - for(auto& itr : _data) - { - _device_data[itr.device_id].emplace_back(&itr); - _device_time[itr.device_id].emplace(itr.entry); - _device_time[itr.device_id].emplace(itr.exit); - auto _dev_id = itr.device_id; - if(get_use_perfetto() && !counter_track::exists(_dev_id)) - { - auto addendum = [&](auto&& _v) { - return JOIN(" ", "Device", _v, JOIN("", '[', _dev_id, ']')); - }; - for(auto nitr : itr.feature_names) - { - auto _name = get_data_labels().at(itr.device_id).at(nitr); - counter_track::emplace(_dev_id, addendum(_name)); - } - } - } - - for(auto& ditr : _device_time) - { - for(auto itr = ditr.second.begin(); itr != ditr.second.end(); ++itr) - { - auto _next = std::next(itr); - if(_next == ditr.second.end()) continue; - _device_range[ditr.first].emplace(((*_next / 2) + (*itr / 2))); - } - } - } - - for(auto& ditr : _device_range) - { - auto _dev_id = ditr.first; - auto _values = std::vector{}; - auto _ts_sorted_data = _device_data[_dev_id]; - std::sort(_ts_sorted_data.begin(), _ts_sorted_data.end(), - [](auto* _l, auto* _r) { 
return _l->exit < _r->exit; }); - for(const auto& itr : ditr.second) - { - auto _v = _get_events(_ts_sorted_data, itr); - uint64_t _ts = itr; - for(auto* vitr : _v) - { - size_t _n = vitr->feature_values.size(); - if(_values.empty()) - { - _values.reserve(_n); - for(size_t i = 0; i < _n; ++i) - { - _values.emplace_back(vitr->feature_values.at(i)); - } - } - else - { - for(size_t i = 0; i < _n; ++i) - { -#ifdef __GNUC__ -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wdouble-promotion" -#endif - auto _plus = [](auto& _lhs, auto&& _rhs) { _lhs += _rhs; }; - std::visit(_plus, _values.at(i), vitr->feature_values.at(i)); -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif - } - } - } - - for(size_t i = 0; i < _values.size(); ++i) - { - auto _trace_counter = [_dev_id, i, _ts](auto&& _val) { - TRACE_COUNTER("kernel_hardware_counter", - counter_track::at(_dev_id, i), _ts, _val); - }; - std::visit(_trace_counter, _values.at(i)); - } - } - } -} - -void -post_process_timemory() -{ - static bool _once = false; - if(_once) return; - - auto _data = rocm_data_t{}; - auto _device_data = std::map>{}; - auto _device_fields = std::map>{}; - auto _device_range = std::map>{}; - - for(size_t i = 0; i < ROCPROFSYS_MAX_THREADS; ++i) - { - auto& _v = component::rocm_data(i); - if(_v) - { - _data.reserve(_data.size() + _v->size()); - for(auto& itr : *_v) - _data.emplace_back(itr); - } - } - - if(_data.empty()) return; - _once = true; - - std::sort(_data.begin(), _data.end()); - - for(auto& itr : _data) - { - _device_data[itr.device_id].emplace_back(&itr); - } - - for(auto& itr : _device_data) - { - // sort according to when it exited - std::sort(itr.second.begin(), itr.second.end(), - [](auto* _lhs, auto* _rhs) { return _lhs->exit < _rhs->exit; }); - } - - using storage_type = typename rocm_data_tracker::storage_type; - using bundle_type = tim::lightweight_tuple; - - auto _info = rocm_metrics(); - static auto _get_description = [&_info](std::string_view _v) { - for(auto& 
itr : _info) - { - if(itr.symbol().find(_v) == 0 || itr.short_description().find(_v) == 0) - { - return itr.long_description(); - } - } - return std::string{}; - }; - - struct local_event - { - rocm_event* parent = nullptr; - mutable std::vector children = {}; - - ROCPROFSYS_DEFAULT_OBJECT(local_event) - - explicit local_event(rocm_event* _v) - : parent{ _v } - {} - - bool operator()(rocm_event* _v) - { - if(!parent) return false; - if(_v->device_id != parent->device_id) return false; - if(_v->entry > parent->entry && _v->exit <= parent->exit) - { - children.emplace_back(_v); - return true; - } - return false; - } - - bool operator<(const local_event& _v) const - { - if(!parent && _v.parent) return true; - if(parent && !_v.parent) return false; - return *parent < *_v.parent; - } - - void operator()(int64_t _index, scope::config _scope) const - { - if(!parent) return; - bundle_type _bundle{ parent->name, _scope }; - _bundle.push(parent->queue_id) - .start() - .store(parent->feature_values.at(_index)); - - std::sort(children.begin(), children.end()); - for(const auto& itr : children) - itr(_index, _scope); - - _bundle.stop().pop(parent->queue_id); - } - }; - - struct local_storage - { - int64_t index = 0; - std::string metric_name = {}; - std::string metric_description = {}; - std::unique_ptr storage = {}; - - local_storage(uint32_t _devid, size_t _idx, std::string_view _name) - : index{ static_cast(_idx) } - , metric_name{ _name } - , metric_description{ _get_description(metric_name) } - { - auto _metric_name = std::string{ _name }; - _metric_name = std::regex_replace( - _metric_name, std::regex{ "(.*)\\[([0-9]+)\\]" }, "$1_$2"); - storage = std::make_unique( - tim::standalone_storage{}, index, - JOIN('-', "rocprof", "device", _devid, _metric_name)); - } - - void operator()(const local_event& _event, scope::config _scope) const - { - operation::set_storage{}(storage.get()); - _event(index, _scope); - } - - void write() const - { - rocm_data_tracker::label() = 
metric_name; - rocm_data_tracker::description() = metric_description; - storage->write(); - } - }; - - auto _local_data = std::map>{}; - auto _scope = scope::get_default(); - - for(auto& ditr : _device_data) - { - ROCPROFSYS_VERBOSE_F(1, "Post-processing %zu entries for device %u...\n", - ditr.second.size(), ditr.first); - auto _storage = std::vector{}; - for(auto& itr : ditr.second) - { - auto _n = itr->feature_names.size(); - if(_n > _storage.size()) - { - _storage.reserve(_n); - for(size_t i = _storage.size(); i < _n; ++i) - _storage.emplace_back( - ditr.first, i, - get_data_labels().at(ditr.first).at(itr->feature_names.at(i))); - } - } - - auto& _local = _local_data[ditr.first]; - _local.reserve(ditr.second.size()); - double _avg = 0.0; - for(auto& itr : ditr.second) - { - if(_local.empty() || itr->entry >= _local.back().parent->exit) - { - _local.emplace_back(itr); - } - else - { - size_t _n = 0; - bool _found = false; - for(auto litr = _local.rbegin(); litr != _local.rend(); ++litr) - { - ++_n; - if((*litr)(itr)) - { - _found = true; - break; - } - } - if(!_found) _local.emplace_back(itr); - _avg += _n; - } - } - - ROCPROFSYS_VERBOSE_F(3, "Average # of iterations before match: %.1f\n", - _avg / ditr.second.size() * 100.0); - - for(auto& sitr : _storage) - { - for(auto& itr : _local) - sitr(itr, _scope); - } - - for(auto& itr : _storage) - itr.write(); - } - - tim::trait::runtime_enabled::set(false); -} -} // namespace - -void -post_process() -{ - if(get_use_perfetto()) post_process_perfetto(); - - if(get_use_timemory()) - { - auto _manager = tim::manager::master_instance(); - if(_manager) - { - _manager->add_cleanup("rocprofiler", &post_process_timemory); - } - else - { - post_process_timemory(); - } - } -} -} // namespace rocprofiler -} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.cpp deleted file mode 100644 index 
b7abcf713a..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.cpp +++ /dev/null @@ -1,967 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "library/roctracer.hpp" -#include "binary/analysis.hpp" -#include "core/components/fwd.hpp" -#include "core/concepts.hpp" -#include "core/config.hpp" -#include "core/debug.hpp" -#include "core/locking.hpp" -#include "library/components/category_region.hpp" -#include "library/runtime.hpp" -#include "library/sampling.hpp" -#include "library/thread_data.hpp" -#include "library/thread_info.hpp" -#include "library/tracing.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#if ROCPROFSYS_HIP_VERSION < 50300 -# include -#endif - -#define AMD_INTERNAL_BUILD 1 -#include - -#if __has_include() || (defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0) -# include -# define ROCPROFSYS_HIP_API_ARGS 1 -#else -# define ROCPROFSYS_HIP_API_ARGS 0 -#endif - -TIMEMORY_DEFINE_API(roctracer) -namespace rocprofsys -{ -namespace -{ -template -auto& -roctracer_type_mutex() -{ - return tim::type_mutex(); -} - -std::string -hip_api_string(hip_api_id_t id, const hip_api_data_t* data) -{ -#if ROCPROFSYS_HIP_API_ARGS > 0 - std::string _v = hipApiString(id, data); - if(_v.empty()) return _v; - auto _pbeg = _v.find('('); - if(_pbeg == std::string::npos) return _v; - auto _pend = _v.find_last_of(')'); - if(_pend == std::string::npos || _pbeg >= _pend) return _v; - auto _n = (_pend - _pbeg - 1); - return _v.substr(_pbeg + 1, _n); -#else - tim::consume_parameters(id, data); -#endif -} - -int& -get_current_device() -{ - static thread_local int _v = 1; - return _v; -} - -std::unordered_set& -get_roctracer_kernels() -{ - static auto _v = std::unordered_set{}; - return _v; -} - -auto& -get_roctracer_hip_data(int64_t _tid = threading::get_id()) -{ - using data_t = std::unordered_map; - using thread_data_t = thread_data; - return thread_data_t::instance(construct_on_thread{ _tid }); -} - -std::unordered_map& -get_roctracer_key_data() -{ - static auto _v = std::unordered_map{}; - return _v; -} - -std::unordered_map& 
-get_roctracer_tid_data() -{ - static auto _v = std::unordered_map{}; - return _v; -} - -auto& -get_hip_activity_callbacks(int64_t _tid = threading::get_id()) -{ - using thread_data_t = - thread_data>, category::roctracer>; - return thread_data_t::instance(construct_on_thread{ _tid }); -} - -size_t -get_hip_activity_callbacks_size() -{ - using thread_data_t = - thread_data>, category::roctracer>; - return thread_data_t::size(); -} - -using hip_activity_mutex_t = std::decay_t; -using key_data_mutex_t = std::decay_t; - -auto& -get_hip_activity_mutex(int64_t _tid = threading::get_id()) -{ - return tim::type_mutex( - _tid % max_supported_threads); -} -} // namespace - -// -int64_t -get_clock_skew() -{ - static auto _use = tim::get_env("ROCPROFSYS_USE_ROCTRACER_CLOCK_SKEW", true); - if(!_use) return 0; - static auto _v = []() { - auto _gpu_now = []() { - uint64_t _ts = 0; - roctracer_get_timestamp(&_ts); - return _ts; - }; - - // discard (warm-up) - (void) tracing::get_clock_skew(_gpu_now, 1); - - auto _diff = tracing::get_clock_skew(_gpu_now, 10); - ROCPROFSYS_BASIC_VERBOSE(1, "CPU/HIP timestamp skew: %li (used: %s)\n", _diff, - _use ? "yes" : "no"); - return _diff; - }(); - return _v; -} - -// HSA API callback function -void -hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) -{ - if(get_state() != State::Active || !trait::runtime_enabled::get()) - return; - - ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); - - (void) arg; - const hsa_api_data_t* data = reinterpret_cast(callback_data); - ROCPROFSYS_CONDITIONAL_PRINT_F( - get_debug() && get_verbose() >= 2, "<%-30s id(%u)\tcorrelation_id(%lu) %s>\n", - roctracer_op_string(domain, cid, 0), cid, data->correlation_id, - (data->phase == ACTIVITY_API_PHASE_ENTER) ? 
"on-enter" : "on-exit"); - - static thread_local int64_t begin_timestamp = 0; - - switch(cid) - { - case HSA_API_ID_hsa_init: - case HSA_API_ID_hsa_shut_down: - case HSA_API_ID_hsa_agent_get_exception_policies: - case HSA_API_ID_hsa_agent_get_info: - case HSA_API_ID_hsa_amd_agent_iterate_memory_pools: - case HSA_API_ID_hsa_amd_agent_memory_pool_get_info: - case HSA_API_ID_hsa_amd_coherency_get_type: - case HSA_API_ID_hsa_amd_memory_pool_get_info: - case HSA_API_ID_hsa_amd_pointer_info: - case HSA_API_ID_hsa_amd_pointer_info_set_userdata: - case HSA_API_ID_hsa_amd_profiling_async_copy_enable: - case HSA_API_ID_hsa_amd_profiling_get_async_copy_time: - case HSA_API_ID_hsa_amd_profiling_get_dispatch_time: - case HSA_API_ID_hsa_amd_profiling_set_profiler_enabled: - case HSA_API_ID_hsa_cache_get_info: - case HSA_API_ID_hsa_code_object_get_info: - case HSA_API_ID_hsa_code_object_get_symbol: - case HSA_API_ID_hsa_code_object_get_symbol_from_name: - case HSA_API_ID_hsa_code_object_reader_create_from_memory: - case HSA_API_ID_hsa_code_symbol_get_info: - case HSA_API_ID_hsa_executable_create_alt: - case HSA_API_ID_hsa_executable_freeze: - case HSA_API_ID_hsa_executable_get_info: - case HSA_API_ID_hsa_executable_get_symbol: - case HSA_API_ID_hsa_executable_get_symbol_by_name: - case HSA_API_ID_hsa_executable_symbol_get_info: - case HSA_API_ID_hsa_extension_get_name: - case HSA_API_ID_hsa_ext_image_data_get_info: - case HSA_API_ID_hsa_ext_image_data_get_info_with_layout: - case HSA_API_ID_hsa_ext_image_get_capability: - case HSA_API_ID_hsa_ext_image_get_capability_with_layout: - case HSA_API_ID_hsa_isa_get_exception_policies: - case HSA_API_ID_hsa_isa_get_info: - case HSA_API_ID_hsa_isa_get_info_alt: - case HSA_API_ID_hsa_isa_get_round_method: - case HSA_API_ID_hsa_region_get_info: - case HSA_API_ID_hsa_system_extension_supported: - case HSA_API_ID_hsa_system_get_extension_table: - case HSA_API_ID_hsa_system_get_info: - case HSA_API_ID_hsa_system_get_major_extension_table: - 
case HSA_API_ID_hsa_wavefront_get_info: break; - default: - { - if(data->phase == ACTIVITY_API_PHASE_ENTER) - { - begin_timestamp = comp::wall_clock::record(); - } - else - { - const auto* _name = roctracer_op_string(domain, cid, 0); - const auto end_timestamp = (cid == HSA_API_ID_hsa_shut_down) - ? begin_timestamp - : comp::wall_clock::record(); - - if(begin_timestamp > end_timestamp) return; - - if(get_use_perfetto()) - { - uint64_t _beg_ts = begin_timestamp; - uint64_t _end_ts = end_timestamp; - tracing::push_perfetto_ts(category::rocm_hsa{}, _name, _beg_ts, - [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation( - ctx, "begin_ns", _beg_ts); - } - }); - tracing::pop_perfetto_ts(category::rocm_hsa{}, _name, _end_ts, - [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation( - ctx, "end_ns", _end_ts); - } - }); - } - - if(get_use_timemory()) - { - auto _beg_ns = begin_timestamp; - auto _end_ns = end_timestamp; - if(tasking::roctracer::get_task_group().pool()) - tasking::roctracer::get_task_group().exec( - [_name, _beg_ns, _end_ns]() { - roctracer_hsa_bundle_t _bundle{ _name }; - _bundle.start() - .store(std::plus{}, - static_cast(_end_ns - _beg_ns)) - .stop(); - }); - } - // timemory is disabled in this callback because collecting data in this - // thread causes strange segmentation faults - } - } - } -} - -void -hsa_activity_callback(uint32_t op, const void* vrecord, void* arg) -{ - const auto* record = static_cast(vrecord); - - if(get_state() != State::Active || !trait::runtime_enabled::get()) - return; - - ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); - - auto&& _protect = comp::roctracer::protect_flush_activity(); - (void) _protect; - - static const char* copy_op_name = "hsa_async_copy"; - static const char* dispatch_op_name = "hsa_dispatch"; - static const char* barrier_op_name = "hsa_barrier"; - const char** _name = nullptr; - - 
switch(op) - { - case HSA_OP_ID_DISPATCH: _name = &dispatch_op_name; break; - case HSA_OP_ID_COPY: _name = ©_op_name; break; - case HSA_OP_ID_BARRIER: _name = &barrier_op_name; break; - default: break; - } - - ROCPROFSYS_CI_FAIL(_name == nullptr, "Error! HSA operation type not handled: %u\n", - op); - - if(!_name) return; - - auto _beg_ns = record->begin_ns + get_clock_skew(); - auto _end_ns = record->end_ns + get_clock_skew(); - - if(get_use_perfetto()) - { - uint64_t _beg = _beg_ns; - uint64_t _end = _end_ns; - tracing::push_perfetto_ts( - category::device_hsa{}, *_name, _beg, [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "begin_ns", _beg); - } - }); - tracing::pop_perfetto_ts( - category::device_hsa{}, *_name, _end, [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "end_ns", _end); - } - }); - } - - auto _func = [_beg_ns, _end_ns, _name]() { - if(get_use_timemory()) - { - roctracer_hsa_bundle_t _bundle{ *_name }; - _bundle.start() - .store(std::plus{}, static_cast(_end_ns - _beg_ns)) - .stop(); - } - }; - - if(tasking::roctracer::get_task_group().pool()) - tasking::roctracer::get_task_group().exec(_func); - - // timemory is disabled in this callback because collecting data in this thread - // causes strange segmentation faults - tim::consume_parameters(arg); -} - -void -hip_exec_activity_callbacks(int64_t _tid) -{ - // guard against initialization of structure when trying to exec - if(static_cast(_tid) >= get_hip_activity_callbacks_size()) return; - - // ROCPROFSYS_ROCTRACER_CALL(roctracer_flush_activity()); - locking::atomic_lock _lk{ get_hip_activity_mutex(_tid) }; - auto& _async_ops = get_hip_activity_callbacks(_tid); - if(!_async_ops) return; - for(auto& itr : *_async_ops) - { - if(itr) itr(); - } - _async_ops->clear(); -} - -namespace -{ -thread_local std::unordered_map gpu_crit_cids = {}; -} - -void 
-roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, - void* /*arg*/) -{ - if(get_state() != State::Active || !trait::runtime_enabled::get()) - return; - - ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); - - if(domain != ACTIVITY_DOMAIN_ROCTX) return; - - static auto _range_map = std::unordered_map{}; - static auto _range_lock = locking::atomic_mutex{}; - const auto* _data = reinterpret_cast(callback_data); - static thread_local auto _range_stack = std::vector{}; - - switch(cid) - { - case ROCTX_API_ID_roctxRangePushA: - { - if(_data->args.message) - { - auto& itr = _range_stack.emplace_back(std::string{ _data->args.message }); - component::category_region::start(itr.c_str()); - } - break; - } - case ROCTX_API_ID_roctxRangePop: - { - if(!_range_stack.empty()) - { - auto& itr = _range_stack.back(); - component::category_region::stop(itr.c_str()); - _range_stack.pop_back(); - } - else - { - ROCPROFSYS_THROW("Error! roctxRangePop stack is empty! Expected " - "roctxRangePush/roctxRangePop on same thread\n"); - } - break; - } - case ROCTX_API_ID_roctxRangeStartA: - { - { - locking::atomic_lock _lk{ _range_lock, std::defer_lock }; - if(!_lk.owns_lock()) _lk.lock(); - _range_map.emplace(roctx_range_id_t{ _data->args.id }, - std::string{ _data->args.message }); - } - - component::category_region::start(_data->args.message); - break; - } - case ROCTX_API_ID_roctxRangeStop: - { - std::string_view _message = {}; - { - locking::atomic_lock _lk{ _range_lock, std::defer_lock }; - if(!_lk.owns_lock()) _lk.lock(); - auto itr = _range_map.find(roctx_range_id_t{ _data->args.id }); - ROCPROFSYS_CI_THROW(itr == _range_map.end(), - "Error! could not find range with id %lu\n", - _data->args.id); - if(itr == _range_map.end()) - { - ROCPROFSYS_VERBOSE(0, "Warning! 
could not find range with id %lu\n", - _data->args.id); - return; - } - else - { - _message = itr->second; - } - } - - if(!_message.empty()) - { - component::category_region::stop(_message.data()); - } - - break; - } - case ROCTX_API_ID_roctxMarkA: - { - if(_data->args.message) - { - component::category_region::mark( - _data->args.message); - } - break; - } - default: break; - } -} - -// HIP API callback function -void -hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) -{ - if(get_state() != State::Active || !trait::runtime_enabled::get()) - return; - - ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); - - assert(domain == ACTIVITY_DOMAIN_HIP_API); - const char* op_name = roctracer_op_string(domain, cid, 0); - if(op_name == nullptr) op_name = hip_api_name(cid); - if(op_name == nullptr) return; - assert(std::string{ op_name } == std::string{ hip_api_name(cid) }); - - switch(cid) - { - case HIP_API_ID___hipPushCallConfiguration: - case HIP_API_ID___hipPopCallConfiguration: - case HIP_API_ID_hipDeviceEnablePeerAccess: -#if ROCPROFSYS_HIP_VERSION_MAJOR > 4 || \ - (ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 3) - case HIP_API_ID_hipImportExternalMemory: - case HIP_API_ID_hipDestroyExternalMemory: -#endif - return; - default: break; - } - - const hip_api_data_t* data = reinterpret_cast(callback_data); - ROCPROFSYS_CONDITIONAL_PRINT_F( - get_debug() && get_verbose() >= 2, "<%-30s id(%u)\tcorrelation_id(%lu) %s>\n", - op_name, cid, data->correlation_id, - (data->phase == ACTIVITY_API_PHASE_ENTER) ? 
"on-enter" : "on-exit"); - - int64_t _ts = comp::wall_clock::record(); - auto _tid = threading::get_id(); - uint64_t _crit_cid = 0; - uint64_t _parent_crit_cid = 0; - uint32_t _depth = 0; - auto _roct_cid = data->correlation_id; - - auto& _device_id = get_current_device(); - - if(data->phase == ACTIVITY_API_PHASE_ENTER) - { - if(cid == HIP_API_ID_hipSetDevice) - get_current_device() = - reinterpret_cast(data->args.hipSetDevice.deviceId) + 1; - - const char* _name = nullptr; - switch(cid) - { - case HIP_API_ID_hipLaunchKernel: - { - _name = hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address, - data->args.hipLaunchKernel.stream); - break; - } - case HIP_API_ID_hipLaunchCooperativeKernel: - { - _name = - hipKernelNameRefByPtr(data->args.hipLaunchCooperativeKernel.f, - data->args.hipLaunchCooperativeKernel.stream); - if(!_name) - { - _name = - hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address, - data->args.hipLaunchKernel.stream); - } - break; - } - case HIP_API_ID_hipHccModuleLaunchKernel: - { - _name = hipKernelNameRef(data->args.hipHccModuleLaunchKernel.f); - break; - } - case HIP_API_ID_hipModuleLaunchKernel: - { - _name = hipKernelNameRef(data->args.hipModuleLaunchKernel.f); - break; - } - case HIP_API_ID_hipExtModuleLaunchKernel: - { - _name = hipKernelNameRef(data->args.hipExtModuleLaunchKernel.f); - break; - } - case HIP_API_ID_hipExtLaunchKernel: - { - _name = - hipKernelNameRefByPtr(data->args.hipExtLaunchKernel.function_address, - data->args.hipLaunchKernel.stream); - break; - } - default: break; - } - - if(_name != nullptr) - { - if(get_use_perfetto() || get_use_timemory() || get_use_rocm_smi()) - { - locking::atomic_lock _lk{ roctracer_type_mutex() }; - get_roctracer_key_data().emplace(_roct_cid, _name); - get_roctracer_tid_data().emplace(_roct_cid, _tid); - } - } - - std::tie(_crit_cid, _parent_crit_cid, _depth) = create_cpu_cid_entry(); - - if(get_use_perfetto()) - { - static auto _compact_annotations = - 
config::get_setting_value( - "ROCPROFSYS_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS") - .value_or(false); - - static auto _enable_backtraces = - config::get_setting_value("ROCPROFSYS_ROCTRACER_HIP_API_BACKTRACE") - .value_or(false); - - constexpr size_t bt_stack_depth = 16; - constexpr size_t bt_ignore_depth = 3; - constexpr bool bt_with_signal_frame = true; - - using backtrace_entry_vec_t = std::vector; - auto _bt_data = std::optional{}; - if(_enable_backtraces && config::get_perfetto_annotations()) - { - auto _backtrace = tim::get_unw_stack(); - _bt_data = backtrace_entry_vec_t{}; - _bt_data->reserve(_backtrace.size()); - for(auto itr : _backtrace) - { - if(itr) - { - if(auto _val = binary::lookup_ipaddr_entry(itr->address()); - _val) - { - _bt_data->emplace_back(std::move(*_val)); - } - } - } - } - - auto _api_id = static_cast(cid); - tracing::push_perfetto_ts( - category::rocm_hip{}, op_name, _ts, - ::perfetto::Flow::ProcessScoped(_roct_cid), - [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "begin_ns", _ts); - tracing::add_perfetto_annotation(ctx, "cid", _crit_cid); - tracing::add_perfetto_annotation(ctx, "pcid", _parent_crit_cid); - tracing::add_perfetto_annotation(ctx, "device", _device_id); - tracing::add_perfetto_annotation(ctx, "tid", _tid); - tracing::add_perfetto_annotation(ctx, "depth", _depth); - tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid); - if(_compact_annotations) - { - tracing::add_perfetto_annotation( - ctx, "args", hip_api_string(_api_id, data)); - } - else - { - auto _args = std::string{ hip_api_string(_api_id, data) }; - if(!_args.empty()) - { - for(auto itr : tim::delimit(_args, ",")) - { - if(itr.empty()) continue; - auto _bpos = itr.find_first_not_of(' '); - auto _epos = itr.find_last_not_of(' '); - if(_epos > _bpos) - itr = itr.substr(_bpos, (_epos - _bpos) + 1); - auto _pos = itr.find('='); - if(_pos != std::string::npos) - 
tracing::add_perfetto_annotation( - ctx, itr.substr(0, _pos), - itr.substr(_pos + 1)); - } - } - } - - if(_enable_backtraces && _bt_data && !_bt_data->empty()) - { - const std::string _unk = "??"; - size_t _bt_cnt = 0; - for(const auto& itr : *_bt_data) - { - const auto* _func = - (itr.name.empty()) ? &_unk : &itr.name; - const auto* _loc = - (itr.location.empty()) ? &_unk : &itr.location; - auto _line = (itr.lineno == 0) ? std::string{ "?" } - : join("", itr.lineno); - auto _entry = join("", demangle(*_func), " @ ", - join(':', *_loc, _line)); - if(_bt_cnt < 10) - { - // Prepend zero for better ordering in UI. - // Only one zero is ever necessary since stack depth - // is limited to 16. - tracing::add_perfetto_annotation( - ctx, join("", "frame#0", _bt_cnt++), _entry); - } - else - { - tracing::add_perfetto_annotation( - ctx, join("", "frame#", _bt_cnt++), _entry); - } - } - } - } - }); - } - if(get_use_timemory()) - { - auto itr = get_roctracer_hip_data()->emplace( - _roct_cid, roctracer_hip_bundle_t{ op_name }); - if(itr.second) - { - itr.first->second.start(); - } - else if(itr.first != get_roctracer_hip_data()->end()) - { - itr.first->second.stop(); - get_roctracer_hip_data()->erase(itr.first); - } - } - - hip_exec_activity_callbacks(_tid); - } - else if(data->phase == ACTIVITY_API_PHASE_EXIT) - { - hip_exec_activity_callbacks(_tid); - - if(get_use_perfetto()) - { - tracing::pop_perfetto_ts( - category::rocm_hip{}, op_name, _ts, [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "end_ns", _ts); - } - }); - } - if(get_use_timemory()) - { - auto _stop = [&_roct_cid](int64_t _tid_v) { - auto& _data = get_roctracer_hip_data(_tid_v); - auto itr = _data->find(_roct_cid); - if(itr != get_roctracer_hip_data()->end()) - { - itr->second.stop(); - _data->erase(itr); - return true; - } - return false; - }; - if(!_stop(_tid)) - { - for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i) - { - 
if(_stop(i)) break; - } - } - } - } - tim::consume_parameters(arg); -} - -// Activity tracing callback -void -hip_activity_callback(const char* begin, const char* end, void* arg) -{ - if(get_state() != State::Active || !trait::runtime_enabled::get()) - return; - - ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); - - auto&& _protect = comp::roctracer::protect_flush_activity(); - (void) _protect; - - if(!trait::runtime_enabled::get()) return; - static auto _kernel_names = std::unordered_map{}; - static auto _indexes = std::unordered_map{}; - static auto _skip_barrier_packets = - config::get_setting_value("ROCPROFSYS_ROCTRACER_DISCARD_BARRIERS") - .value_or(false); - const roctracer_record_t* record = reinterpret_cast(begin); - const roctracer_record_t* end_record = - reinterpret_cast(end); - - auto&& _advance_record = [&record]() { - ROCPROFSYS_ROCTRACER_CALL(roctracer_next_record(record, &record)); - }; - - while(record < end_record) - { - // make sure every iteration advances regardless of where return point happens - scope::destructor _next_dtor{ _advance_record }; - - // ROCPROFSYS_CI will enable these asserts and should fail if something relevant - // changes - assert(HIP_OP_ID_DISPATCH == 0); - assert(HIP_OP_ID_COPY == 1); - assert(HIP_OP_ID_BARRIER == 2); - - if(record->domain == ACTIVITY_DOMAIN_HSA_OPS) - { - hsa_activity_callback(record->op, record, arg); - continue; - } - if(record->domain != ACTIVITY_DOMAIN_HIP_OPS) continue; - if(record->op > HIP_OP_ID_BARRIER) continue; - if(_skip_barrier_packets && record->op == HIP_OP_ID_BARRIER) continue; - - const char* op_name = - roctracer_op_string(record->domain, record->op, record->kind); - auto _ns_skew = get_clock_skew(); - uint64_t _beg_ns = record->begin_ns + _ns_skew; - uint64_t _end_ns = record->end_ns + _ns_skew; - auto _roct_cid = record->correlation_id; - - auto& _keys = get_roctracer_key_data(); - auto& _tids = get_roctracer_tid_data(); - - int64_t _tid = 0; // thread id - int32_t _devid = 
record->device_id; // device id - int64_t _queid = record->queue_id; // queue id - uintptr_t _queue = 0; // Host queue (stream) - const char* _name = nullptr; - bool _found = false; - - { - locking::atomic_lock _lk{ roctracer_type_mutex() }; - if(_tids.find(_roct_cid) != _tids.end()) - { - _found = true; - _tid = _tids.at(_roct_cid); - auto itr = _keys.find(_roct_cid); - if(itr != _keys.end()) _name = itr->second; - } - } - - if(_name == nullptr && op_name == nullptr) continue; - if(_name == nullptr) _name = op_name; - - static auto _op_id_names = - std::array{ "DISPATCH", "COPY", "BARRIER" }; - - if(_end_ns < _beg_ns) - { - auto _verbose = []() { return get_verbose() >= 0 || get_debug(); }; - static size_t _n = 0; - static size_t _nmax = - get_env("ROCPROFSYS_ROCTRACER_DISCARD_INVALID", 0); - if(_nmax == 0) std::swap(_end_ns, _beg_ns); - ROCPROFSYS_WARNING_IF_F( - _n < _nmax && _verbose(), - "%4zu :: Discarding kernel roctracer activity record which ended before " - "it started :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) " - "delta=%li, device=%d, queue=%lu, pid=%u, tid=%lu, op=%s\n", - _n, op_name, _name, record->correlation_id, _beg_ns, _end_ns, - (static_cast(_end_ns) - static_cast(_beg_ns)), _devid, - _queid, record->process_id, _tid, _op_id_names.at(record->op)); - ROCPROFSYS_WARNING_IF_F( - _nmax > 0 && _n == _nmax && _verbose(), - "Suppressing future messages about discarding kernel roctracer activity " - "record which ended before it started. Set " - "ROCPROFSYS_ROCTRACER_DISCARD_INVALID=N to increase/decrease the number " - "of messages. 
If N is set to 0, data will be included after swapping the " - "begin and end values\n"); - if(_end_ns < _beg_ns) - { - ++_n; - continue; - } - } - - // execute this on this thread bc of how perfetto visualization works - if(get_use_perfetto()) - { - if(_kernel_names.find(_name) == _kernel_names.end()) - _kernel_names.emplace(_name, tim::demangle(_name)); - - auto _track_desc = [](int32_t _device_id, int64_t _queue_id) { - if(config::get_perfetto_roctracer_per_stream()) - return JOIN("", "HIP Activity Device ", _device_id, ", Queue ", - _queue_id); - return JOIN("", "HIP Activity Device ", _device_id); - }; - - const auto _track = tracing::get_perfetto_track( - category::device_hip{}, _track_desc, _devid, - (get_perfetto_roctracer_per_stream()) ? _queid : 0); - - assert(_end_ns >= _beg_ns); - tracing::push_perfetto_track( - category::device_hip{}, _kernel_names.at(_name).c_str(), _track, _beg_ns, - ::perfetto::Flow::ProcessScoped(_roct_cid), - [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns); - tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); - tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid); - tracing::add_perfetto_annotation(ctx, "device", _devid); - tracing::add_perfetto_annotation(ctx, "queue", _queid); - tracing::add_perfetto_annotation(ctx, "tid", _tid); - tracing::add_perfetto_annotation( - ctx, "stream", JOIN("", "0x", std::hex, _queue)); - tracing::add_perfetto_annotation(ctx, "op", - _op_id_names.at(record->op)); - } - }); - tracing::pop_perfetto_track(category::device_hip{}, "", _track, _end_ns); - } - - if(_found && _name != nullptr && get_use_timemory()) - { - auto _func = [_beg_ns, _end_ns, _name]() { - roctracer_hip_bundle_t _bundle{ _name }; - _bundle.start() - .store(std::plus{}, static_cast(_end_ns - _beg_ns)) - .stop() - .get([&](comp::wall_clock* wc) { - wc->set_value(_end_ns - _beg_ns); - wc->set_accum(_end_ns - _beg_ns); - return 
wc; - }); - _bundle.pop(); - }; - - auto& _async_ops = get_hip_activity_callbacks(_tid); - locking::atomic_lock _lk{ get_hip_activity_mutex(_tid) }; - _async_ops->emplace_back(std::move(_func)); - } - } - - // ensures that all the updates are written - if(get_use_perfetto()) ::perfetto::TrackEvent::Flush(); -} - -bool& -roctracer_is_init() -{ - static bool _v = tim::get_env("ROCPROFSYS_ROCTRACER_IS_INIT", false); - return _v; -} - -bool& -roctracer_is_setup() -{ - static bool _v = false; - return _v; -} - -using roctracer_functions_t = std::vector>>; - -roctracer_functions_t& -roctracer_setup_routines() -{ - static auto _v = roctracer_functions_t{}; - return _v; -} - -roctracer_functions_t& -roctracer_shutdown_routines() -{ - static auto _v = roctracer_functions_t{}; - return _v; -} -} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.hpp deleted file mode 100644 index e0f0a4a163..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/roctracer.hpp +++ /dev/null @@ -1,89 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#pragma once - -#include "core/config.hpp" -#include "core/debug.hpp" -#include "core/hip_runtime.hpp" -#include "core/perfetto.hpp" -#include "library/components/roctracer.hpp" -#include "library/ptl.hpp" - -#include -#include - -// Macro to check ROC-tracer calls status -#define ROCPROFSYS_ROCTRACER_CALL(call) \ - { \ - ROCPROFSYS_DEBUG_F(#call); \ - int err = call; \ - if(err != 0) \ - { \ - ROCPROFSYS_PRINT_F("%s in: %s\n", roctracer_error_string(), #call); \ - } \ - } - -namespace rocprofsys -{ -using roctracer_hip_bundle_t = - tim::component_bundle; -using roctracer_hsa_bundle_t = - tim::component_bundle; -using roctracer_functions_t = std::vector>>; - -// HSA API callback function -void -hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg); - -void -hsa_activity_callback(uint32_t op, const void* record, void* arg); - -void -hip_exec_activity_callbacks(int64_t _tid); - -// HIP API callback function -void -hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg); - -void -roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg); - -// Activity tracing callback -void -hip_activity_callback(const char* begin, const char* end, void*); - -bool& -roctracer_is_init(); - -bool& -roctracer_is_setup(); - -int64_t -get_clock_skew(); - -roctracer_functions_t& -roctracer_setup_routines(); - -roctracer_functions_t& -roctracer_shutdown_routines(); -} // namespace rocprofsys diff --git 
a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.hpp index 7d64c326e3..5a8ea562eb 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.hpp @@ -33,7 +33,6 @@ #include "library/components/mpi_gotcha.hpp" #include "library/components/numa_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" -#include "library/components/roctracer.hpp" #include "library/thread_data.hpp" #include diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-rocm-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-rocm-tests.cmake index feed0f1bea..50d4ef9584 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-rocm-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-rocm-tests.cmake @@ -4,9 +4,7 @@ # # -------------------------------------------------------------------------------------- # -set(ROCPROFSYS_ROCM_EVENTS_TEST - "GRBM_COUNT,GPUBusy,SQ_WAVES,SQ_INSTS_VALU,VALUInsts,TCC_HIT_sum,TA_TA_BUSY[0]:device=0,TA_TA_BUSY[11]:device=0" - ) +set(ROCPROFSYS_ROCM_EVENTS_TEST "GRBM_COUNT,SQ_WAVES,SQ_INSTS_VALU,TA_TA_BUSY:device=0") rocprofiler_systems_add_test( NAME transpose @@ -26,7 +24,8 @@ rocprofiler_systems_add_test( args -E uniform_int_distribution - ENVIRONMENT "${_base_environment}") + ENVIRONMENT "${_base_environment}" + RUNTIME_TIMEOUT 480) rocprofiler_systems_add_test( SKIP_REWRITE SKIP_RUNTIME @@ -36,9 +35,7 @@ rocprofiler_systems_add_test( GPU ON NUM_PROCS 1 RUN_ARGS 1 2 2 - ENVIRONMENT - "${_base_environment};ROCPROFSYS_ROCTRACER_HSA_ACTIVITY=OFF;ROCPROFSYS_ROCTRACER_HSA_API=OFF" - ) + ENVIRONMENT "${_base_environment}") rocprofiler_systems_add_test( SKIP_BASELINE SKIP_RUNTIME @@ -64,7 +61,11 @@ rocprofiler_systems_add_test( ENVIRONMENT "${_base_environment}" REWRITE_FAIL_REGEX "0 instrumented loops in procedure transpose") 
-if(ROCPROFSYS_USE_ROCPROFILER) +if(ROCPROFSYS_USE_ROCM) + set(_ROCP_PASS_REGEX + "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-TA_TA_BUSY.txt(.*)" + ) + rocprofiler_systems_add_test( SKIP_BASELINE SKIP_RUNTIME NAME transpose-rocprofiler @@ -76,22 +77,7 @@ if(ROCPROFSYS_USE_ROCPROFILER) REWRITE_ARGS -e -v 2 -E uniform_int_distribution ENVIRONMENT "${_base_environment};ROCPROFSYS_ROCM_EVENTS=${ROCPROFSYS_ROCM_EVENTS_TEST}" - REWRITE_RUN_PASS_REGEX - "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" - ) + REWRITE_RUN_PASS_REGEX "${_ROCP_PASS_REGEX}" + SAMPLING_PASS_REGEX "${_ROCP_PASS_REGEX}") - rocprofiler_systems_add_test( - SKIP_BASELINE SKIP_RUNTIME - NAME transpose-rocprofiler-no-roctracer - TARGET transpose - LABELS "rocprofiler" - MPI ${TRANSPOSE_USE_MPI} - GPU ON - NUM_PROCS ${NUM_PROCS} - REWRITE_ARGS -e -v 2 -E uniform_int_distribution - ENVIRONMENT - "${_base_environment};ROCPROFSYS_USE_ROCTRACER=OFF;ROCPROFSYS_ROCM_EVENTS=${ROCPROFSYS_ROCM_EVENTS_TEST}" - REWRITE_RUN_PASS_REGEX - "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" - REWRITE_RUN_FAIL_REGEX "roctracer.txt|ROCPROFSYS_ABORT_FAIL_REGEX") endif() diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake index 5b92f55df6..fafafec9dc 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake @@ -226,7 +226,7 
@@ endif() # -------------------------------------------------------------------------------------- # set(_VALID_GPU OFF) -if(ROCPROFSYS_USE_HIP AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU)) +if(ROCPROFSYS_USE_ROCM AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU)) set(_VALID_GPU ON) find_program( ROCPROFSYS_ROCM_SMI_EXE @@ -254,7 +254,7 @@ if(ROCPROFSYS_USE_HIP AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU)) endif() endif() -set(LULESH_USE_GPU ${LULESH_USE_HIP}) +set(LULESH_USE_GPU ${LULESH_USE_ROCM}) if(LULESH_USE_CUDA) set(LULESH_USE_GPU ON) endif() @@ -314,8 +314,6 @@ ROCPROFSYS_SAMPLING_FREQ = 300 ROCPROFSYS_SAMPLING_DELAY = 0.05 ROCPROFSYS_SAMPLING_CPUS = 0-${NUM_SAMPLING_PROCS} ROCPROFSYS_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES -ROCPROFSYS_ROCTRACER_HSA_API = ON -ROCPROFSYS_ROCTRACER_HSA_ACTIVITY = ON # test-specific values ${_FILE_CONTENTS} @@ -430,18 +428,18 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST) if(TEST_GPU) list(APPEND TEST_LABELS "gpu") - if(NOT "ROCPROFSYS_USE_ROCTRACER=OFF" IN_LIST TEST_ENVIRONMENT) - list(APPEND TEST_LABELS "roctracer") + if(NOT "ROCPROFSYS_USE_ROCM=OFF" IN_LIST TEST_ENVIRONMENT) + list(APPEND TEST_LABELS "rocm") endif() - if(NOT "ROCPROFSYS_USE_ROCM_SMI=OFF" IN_LIST TEST_ENVIRONMENT) + if(NOT "ROCPROFSYS_USE_ROCM=OFF" IN_LIST TEST_ENVIRONMENT) list(APPEND TEST_LABELS "rocm-smi") endif() endif() - if("ROCPROFSYS_USE_ROCTRACER=ON" IN_LIST TEST_ENVIRONMENT AND NOT "roctracer" IN_LIST - TEST_ENVIRONMENT) - list(APPEND TEST_LABELS "roctracer") + if("ROCPROFSYS_USE_ROCM=ON" IN_LIST TEST_ENVIRONMENT AND NOT "rocm" IN_LIST + TEST_ENVIRONMENT) + list(APPEND TEST_LABELS "rocm") endif() if("ROCPROFSYS_USE_ROCM_SMI=ON" IN_LIST TEST_ENVIRONMENT AND NOT "rocm-smi" IN_LIST @@ -449,11 +447,6 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST) list(APPEND TEST_LABELS "rocm-smi") endif() - if("ROCPROFSYS_USE_ROCPROFILER=ON" IN_LIST TEST_ENVIRONMENT - AND NOT "rocprofiler" IN_LIST TEST_ENVIRONMENT) - list(APPEND TEST_LABELS 
"rocprofiler") - endif() - if(TARGET ${TEST_TARGET}) if(DEFINED TEST_MPI AND ${TEST_MPI}