Update to use rocprofiler-sdk (#55)
- Renames the CMake option "ROCPROFSYS_USE_HIP" to "ROCPROFSYS_USE_ROCM"
- Remove the "ROCPROFSYS_USE_ROCM_SMI" option. It is controlled with the "ROCPROFSYS_USE_ROCM" option instead.
- Runtime configuration can still toggle ROCPROFSYS_USE_ROCM_SMI to disable the sampling.
- Rename ROCPROFSYS_HIP_VERSION macro to ROCPROFSYS_ROCM_VERSION and remove blocks for `ROCPROFSYS_ROCM_VERSION < 60000`
- Remove ROCPROFSYS_USE_ROCTRACER and ROCPROFSYS_USE_ROCPROFILER
- Update test cases
- Update Dockerfiles and workflows to install CMake 3.21, which is required by the rocprofiler-sdk find_package script.
- Removed rocm-6.2 from workflows due to a rocprofiler-sdk API change.
[ROCm/rocprofiler-systems commit: 88aa2d3cbe]
このコミットが含まれているのは:
+1
-39
@@ -39,12 +39,10 @@ jobs:
|
||||
version: "15.5"
|
||||
- distro: "opensuse"
|
||||
version: "15.6"
|
||||
- distro: "rhel"
|
||||
version: "8.8"
|
||||
- distro: "rhel"
|
||||
version: "8.10"
|
||||
- distro: "rhel"
|
||||
version: "9.2"
|
||||
version: "9.3"
|
||||
- distro: "rhel"
|
||||
version: "9.4"
|
||||
|
||||
@@ -90,9 +88,6 @@ jobs:
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "20.04"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "20.04"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "20.04"
|
||||
rocm-version: "6.3"
|
||||
@@ -100,9 +95,6 @@ jobs:
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "22.04"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "22.04"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "22.04"
|
||||
rocm-version: "6.3"
|
||||
@@ -110,9 +102,6 @@ jobs:
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "24.04"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "24.04"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "24.04"
|
||||
rocm-version: "6.3"
|
||||
@@ -120,9 +109,6 @@ jobs:
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.5"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.5"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.5"
|
||||
rocm-version: "6.3"
|
||||
@@ -130,43 +116,19 @@ jobs:
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.6"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.6"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.6"
|
||||
rocm-version: "6.3"
|
||||
# RHEL 8.9
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.9"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.9"
|
||||
rocm-version: "6.2"
|
||||
# RHEL 8.10
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.10"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.10"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.10"
|
||||
rocm-version: "6.3"
|
||||
# RHEL 9.3
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.3"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.3"
|
||||
rocm-version: "6.2"
|
||||
# RHEL 9.4
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.4"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.4"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.4"
|
||||
rocm-version: "6.3"
|
||||
|
||||
@@ -37,9 +37,6 @@ jobs:
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "20.04"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "20.04"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "20.04"
|
||||
rocm-version: "6.3"
|
||||
@@ -47,9 +44,6 @@ jobs:
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "22.04"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "22.04"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "22.04"
|
||||
rocm-version: "6.3"
|
||||
@@ -57,9 +51,6 @@ jobs:
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "24.04"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "24.04"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "ubuntu"
|
||||
os-version: "24.04"
|
||||
rocm-version: "6.3"
|
||||
@@ -67,9 +58,6 @@ jobs:
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.5"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.5"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.5"
|
||||
rocm-version: "6.3"
|
||||
@@ -77,43 +65,20 @@ jobs:
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.6"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.6"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "opensuse"
|
||||
os-version: "15.6"
|
||||
rocm-version: "6.3"
|
||||
# RHEL 8.9
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.9"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.9"
|
||||
rocm-version: "6.2"
|
||||
# RHEL 8.10
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.10"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.10"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "rhel"
|
||||
os-version: "8.10"
|
||||
rocm-version: "6.3"
|
||||
# RHEL 9.3
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.3"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.3"
|
||||
rocm-version: "6.2"
|
||||
# RHEL 9.4
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.4"
|
||||
rocm-version: "0.0"
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.4"
|
||||
rocm-version: "6.2"
|
||||
- os-distro: "rhel"
|
||||
os-version: "9.4"
|
||||
rocm-version: "6.3"
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
fi
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
python3 -m pip install 'cmake==3.21' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
|
||||
|
||||
- name: Configure Env
|
||||
@@ -93,7 +93,7 @@ jobs:
|
||||
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems
|
||||
-DROCPROFSYS_BUILD_TESTING=ON
|
||||
-DROCPROFSYS_USE_MPI=OFF
|
||||
-DROCPROFSYS_USE_HIP=OFF
|
||||
-DROCPROFSYS_USE_ROCM=OFF
|
||||
-DROCPROFSYS_USE_OMPT=OFF
|
||||
-DROCPROFSYS_USE_PYTHON=ON
|
||||
-DROCPROFSYS_INSTALL_PERFETTO_TOOLS=OFF
|
||||
|
||||
+4
-4
@@ -46,8 +46,8 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
compiler: ['g++']
|
||||
os-release: [ '8.10', '9.2', '9.4' ]
|
||||
rocm-version: [ '0.0', '6.2', '6.3' ]
|
||||
os-release: [ '8.10', '9.3', '9.4' ]
|
||||
rocm-version: [ '0.0', '6.3' ]
|
||||
build-type: ['Release']
|
||||
|
||||
steps:
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
fi
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
python3 -m pip install 'cmake==3.21' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
|
||||
|
||||
- name: Install ROCm Packages
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems
|
||||
-DROCPROFSYS_BUILD_TESTING=ON
|
||||
-DROCPROFSYS_USE_MPI=OFF
|
||||
-DROCPROFSYS_USE_HIP=${USE_HIP}
|
||||
-DROCPROFSYS_USE_ROCM=${USE_HIP}
|
||||
-DROCPROFSYS_USE_OMPT=OFF
|
||||
-DROCPROFSYS_USE_PYTHON=ON
|
||||
-DROCPROFSYS_USE_MPI_HEADERS=ON
|
||||
|
||||
+10
-16
@@ -100,7 +100,7 @@ jobs:
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
python3 -m pip install 'cmake==3.21' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
|
||||
apt-get -y --purge autoremove &&
|
||||
apt-get -y clean &&
|
||||
@@ -145,7 +145,7 @@ jobs:
|
||||
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems
|
||||
-DROCPROFSYS_BUILD_TESTING=ON
|
||||
-DROCPROFSYS_USE_MPI=OFF
|
||||
-DROCPROFSYS_USE_HIP=OFF
|
||||
-DROCPROFSYS_USE_ROCM=OFF
|
||||
-DROCPROFSYS_USE_OMPT=OFF
|
||||
-DROCPROFSYS_USE_PAPI=OFF
|
||||
-DROCPROFSYS_USE_PYTHON=${{ matrix.python }}
|
||||
@@ -245,16 +245,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
compiler: ['g++']
|
||||
rocm-version: ['6.2']
|
||||
rocm-version: ['6.3']
|
||||
mpi-headers: ['OFF']
|
||||
build-jobs: ['3']
|
||||
ctest-exclude: ['-LE "mpi-example|transpose"']
|
||||
include:
|
||||
- compiler: 'g++'
|
||||
rocm-version: 'latest'
|
||||
mpi-headers: 'ON'
|
||||
build-jobs: '2'
|
||||
ctest-exclude: '-LE transpose'
|
||||
ctest-exclude: ['-LE "transpose"']
|
||||
|
||||
env:
|
||||
BUILD_TYPE: MinSizeRel
|
||||
@@ -282,7 +276,7 @@ jobs:
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
python3 -m pip install 'cmake==3.21' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
|
||||
apt-get -y --purge autoremove &&
|
||||
apt-get -y clean &&
|
||||
@@ -336,7 +330,7 @@ jobs:
|
||||
-DROCPROFSYS_BUILD_EXTRA_OPTIMIZATIONS=OFF
|
||||
-DROCPROFSYS_BUILD_LTO=OFF
|
||||
-DROCPROFSYS_USE_MPI=OFF
|
||||
-DROCPROFSYS_USE_HIP=ON
|
||||
-DROCPROFSYS_USE_ROCM=ON
|
||||
-DROCPROFSYS_MAX_THREADS=64
|
||||
-DROCPROFSYS_USE_PAPI=OFF
|
||||
-DROCPROFSYS_USE_OMPT=OFF
|
||||
@@ -440,7 +434,7 @@ jobs:
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
python3 -m pip install 'cmake==3.21' &&
|
||||
sudo apt-get -y --purge autoremove &&
|
||||
sudo apt-get -y clean
|
||||
|
||||
@@ -477,7 +471,7 @@ jobs:
|
||||
-DROCPROFSYS_BUILD_TESTING=ON
|
||||
-DROCPROFSYS_BUILD_DYNINST=ON
|
||||
-DROCPROFSYS_USE_MPI=${USE_MPI}
|
||||
-DROCPROFSYS_USE_HIP=OFF
|
||||
-DROCPROFSYS_USE_ROCM=OFF
|
||||
-DROCPROFSYS_USE_PYTHON=${{ matrix.python }}
|
||||
-DROCPROFSYS_USE_OMPT=${{ matrix.ompt }}
|
||||
-DROCPROFSYS_USE_PAPI=${{ matrix.papi }}
|
||||
@@ -593,7 +587,7 @@ jobs:
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
python3 -m pip install 'cmake==3.21' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
|
||||
apt-get -y --purge autoremove &&
|
||||
apt-get -y clean &&
|
||||
@@ -625,7 +619,7 @@ jobs:
|
||||
-DROCPROFSYS_USE_PYTHON=ON
|
||||
-DROCPROFSYS_USE_OMPT=ON
|
||||
-DROCPROFSYS_USE_PAPI=ON
|
||||
-DROCPROFSYS_USE_HIP=OFF
|
||||
-DROCPROFSYS_USE_ROCM=OFF
|
||||
-DROCPROFSYS_USE_RCCL=OFF
|
||||
-DROCPROFSYS_MAX_THREADS=64
|
||||
-DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl"
|
||||
|
||||
@@ -75,22 +75,7 @@ jobs:
|
||||
static-libgcc: 'OFF'
|
||||
static-libstdcxx: 'OFF'
|
||||
build-dyninst: 'OFF'
|
||||
rocm-version: '6.2'
|
||||
- compiler: 'g++'
|
||||
hip: 'ON'
|
||||
mpi: 'OFF'
|
||||
ompt: 'OFF'
|
||||
papi: 'OFF'
|
||||
python: 'ON'
|
||||
lto: 'OFF'
|
||||
strip: 'OFF'
|
||||
hidden: 'ON'
|
||||
build-type: 'Release'
|
||||
mpi-headers: 'OFF'
|
||||
static-libgcc: 'OFF'
|
||||
static-libstdcxx: 'OFF'
|
||||
build-dyninst: 'OFF'
|
||||
rocm-version: 'latest'
|
||||
rocm-version: '6.3'
|
||||
|
||||
env:
|
||||
OMPI_ALLOW_RUN_AS_ROOT: 1
|
||||
@@ -116,7 +101,7 @@ jobs:
|
||||
openmpi-bin python3-pip texinfo ${{ matrix.compiler }} &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
python3 -m pip install 'cmake==3.21' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
|
||||
|
||||
- name: Install ROCm Packages
|
||||
@@ -183,7 +168,7 @@ jobs:
|
||||
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems-dev
|
||||
-DROCPROFSYS_BUILD_TESTING=ON
|
||||
-DROCPROFSYS_USE_MPI=${{ matrix.mpi }}
|
||||
-DROCPROFSYS_USE_HIP=${{ matrix.hip }}
|
||||
-DROCPROFSYS_USE_ROCM=${{ matrix.hip }}
|
||||
-DROCPROFSYS_USE_OMPT=${{ matrix.ompt }}
|
||||
-DROCPROFSYS_USE_PAPI=${{ matrix.papi }}
|
||||
-DROCPROFSYS_USE_PYTHON=${{ matrix.python }}
|
||||
|
||||
@@ -101,7 +101,7 @@ jobs:
|
||||
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems \
|
||||
-DROCPROFSYS_BUILD_TESTING=ON \
|
||||
-DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl" \
|
||||
-DROCPROFSYS_USE_HIP=${USE_ROCM} \
|
||||
-DROCPROFSYS_USE_ROCM=${USE_ROCM} \
|
||||
-DRCOPROFSYS_USE_PYTHON=ON \
|
||||
-DROCPROFSYS_STRIP_LIBRARIES=${{ matrix.strip }} \
|
||||
-DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs \
|
||||
|
||||
@@ -176,18 +176,11 @@ rocprofiler_systems_add_option(ROCPROFSYS_USE_CLANG_TIDY "Enable clang-tidy" OFF
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_BFD
|
||||
"Enable BFD support (map call-stack samples to LOC)" ON)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_MPI "Enable MPI support" OFF)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_HIP "Enable HIP support" ON)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCM "Enable ROCm support" ON)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_PAPI "Enable HW counter support via PAPI"
|
||||
ON)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCTRACER "Enable roctracer support"
|
||||
${ROCPROFSYS_USE_HIP})
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCPROFILER "Enable rocprofiler support"
|
||||
${ROCPROFSYS_USE_HIP})
|
||||
rocprofiler_systems_add_option(
|
||||
ROCPROFSYS_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling"
|
||||
${ROCPROFSYS_USE_HIP})
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_RCCL "Enable RCCL support"
|
||||
${ROCPROFSYS_USE_HIP})
|
||||
${ROCPROFSYS_USE_ROCM})
|
||||
rocprofiler_systems_add_option(
|
||||
ROCPROFSYS_USE_MPI_HEADERS
|
||||
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
|
||||
@@ -217,30 +210,10 @@ elseif("$ENV{ROCPROFSYS_CI}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT ROCPROFSYS_USE_HIP)
|
||||
set(ROCPROFSYS_USE_ROCTRACER
|
||||
OFF
|
||||
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
|
||||
set(ROCPROFSYS_USE_ROCPROFILER
|
||||
OFF
|
||||
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
|
||||
set(ROCPROFSYS_USE_ROCM_SMI
|
||||
OFF
|
||||
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
|
||||
if(NOT ROCPROFSYS_USE_ROCM)
|
||||
set(ROCPROFSYS_USE_RCCL
|
||||
OFF
|
||||
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
|
||||
elseif(
|
||||
ROCPROFSYS_USE_HIP
|
||||
AND NOT ROCPROFSYS_USE_ROCTRACER
|
||||
AND NOT ROCPROFSYS_USE_ROCPROFILER
|
||||
AND NOT ROCPROFSYS_USE_ROCM_SMI
|
||||
AND NOT ROCPROFSYS_USE_RCCL)
|
||||
rocprofiler_systems_message(
|
||||
AUTHOR_WARNING
|
||||
"Setting ROCPROFSYS_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled"
|
||||
)
|
||||
set(ROCPROFSYS_USE_HIP OFF)
|
||||
CACHE BOOL "Disabled via ROCPROFSYS_USE_ROCM=OFF" FORCE)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_BUILD_TESTING)
|
||||
@@ -378,14 +351,6 @@ endif()
|
||||
#
|
||||
# ------------------------------------------------------------------------------#
|
||||
|
||||
if(NOT ROCPROFSYS_USE_ROCTRACER AND NOT ROCPROFSYS_USE_ROCPROFILER)
|
||||
set(ROCPROFSYS_HSA_ENV "# ")
|
||||
endif()
|
||||
|
||||
if(NOT ROCPROFSYS_USE_ROCPROFILER)
|
||||
set(ROCPROFSYS_ROCP_ENV "# ")
|
||||
endif()
|
||||
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/LICENSE
|
||||
${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/doc/${PROJECT_NAME}/LICENSE
|
||||
|
||||
@@ -54,9 +54,7 @@ set(ROCPROFSYS_CPACK_SYSTEM_NAME
|
||||
CACHE STRING "System name, e.g. Linux or Ubuntu-20.04")
|
||||
set(ROCPROFSYS_CPACK_PACKAGE_SUFFIX "")
|
||||
|
||||
if(ROCPROFSYS_USE_HIP
|
||||
OR ROCPROFSYS_USE_ROCTRACER
|
||||
OR ROCPROFSYS_USE_ROCM_SMI)
|
||||
if(ROCPROFSYS_USE_ROCM)
|
||||
set(ROCPROFSYS_CPACK_PACKAGE_SUFFIX
|
||||
"${ROCPROFSYS_CPACK_PACKAGE_SUFFIX}-ROCm-${ROCmVersion_NUMERIC_VERSION}")
|
||||
endif()
|
||||
@@ -159,19 +157,12 @@ if(NOT ROCPROFSYS_BUILD_DYNINST)
|
||||
endif()
|
||||
endif()
|
||||
if(ROCmVersion_FOUND)
|
||||
set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
|
||||
set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
|
||||
set(_ROCM_SMI_SUFFIX
|
||||
" (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})")
|
||||
endif()
|
||||
if(ROCPROFSYS_USE_ROCM_SMI)
|
||||
if(ROCPROFSYS_USE_ROCM)
|
||||
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocm-smi-lib${_ROCM_SMI_SUFFIX}")
|
||||
endif()
|
||||
if(ROCPROFSYS_USE_ROCTRACER)
|
||||
list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}")
|
||||
endif()
|
||||
if(ROCPROFSYS_USE_ROCPROFILER)
|
||||
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}")
|
||||
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-sdk (>= ${rocprofiler-sdk_VERSION})")
|
||||
endif()
|
||||
if(ROCPROFSYS_USE_MPI)
|
||||
if("${ROCPROFSYS_MPI_IMPL}" STREQUAL "openmpi")
|
||||
|
||||
@@ -109,13 +109,6 @@ set(_ROCPROFSYS_PAPI_COMPONENTS
|
||||
)
|
||||
|
||||
if(ROCPROFSYS_PAPI_AUTO_COMPONENTS)
|
||||
# rocm
|
||||
if(ROCPROFSYS_USE_HIP
|
||||
OR ROCPROFSYS_USE_ROCTRACER
|
||||
OR ROCPROFSYS_USE_ROCM_SMI)
|
||||
list(APPEND _ROCPROFSYS_PAPI_COMPONENTS rocm)
|
||||
endif()
|
||||
|
||||
# lmsensors
|
||||
find_path(ROCPROFSYS_PAPI_LMSENSORS_ROOT_DIR NAMES include/sensors/sensors.h
|
||||
include/sensors.h)
|
||||
@@ -209,28 +202,35 @@ externalproject_add(
|
||||
BUILD_IN_SOURCE 1
|
||||
PATCH_COMMAND
|
||||
${CMAKE_COMMAND} -E env CC=${PAPI_C_COMPILER}
|
||||
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation LIBS=-lrt LDFLAGS=-lrt
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} <SOURCE_DIR>/configure --quiet
|
||||
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free LIBS=-lrt
|
||||
LDFLAGS=-lrt ${ROCPROFSYS_PAPI_EXTRA_ENV} <SOURCE_DIR>/configure --quiet
|
||||
--prefix=${ROCPROFSYS_PAPI_INSTALL_DIR} --with-static-lib=yes --with-shared-lib=no
|
||||
--with-perf-events --with-tests=no
|
||||
--with-components=${_ROCPROFSYS_PAPI_COMPONENTS}
|
||||
--libdir=${ROCPROFSYS_PAPI_INSTALL_DIR}/lib
|
||||
CONFIGURE_COMMAND
|
||||
${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
|
||||
${CMAKE_COMMAND} -E env
|
||||
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s -j
|
||||
${ROCPROFSYS_PAPI_CONFIGURE_JOBS}
|
||||
BUILD_COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
|
||||
BUILD_COMMAND
|
||||
${CMAKE_COMMAND} -E env
|
||||
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
|
||||
INSTALL_COMMAND ""
|
||||
BUILD_BYPRODUCTS "${_ROCPROFSYS_PAPI_BUILD_BYPRODUCTS}")
|
||||
|
||||
# target for re-executing the installation
|
||||
add_custom_target(
|
||||
rocprofiler-systems-papi-install
|
||||
COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s
|
||||
COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
|
||||
COMMAND
|
||||
${CMAKE_COMMAND} -E env
|
||||
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s
|
||||
COMMAND
|
||||
${CMAKE_COMMAND} -E env
|
||||
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
|
||||
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
|
||||
WORKING_DIRECTORY ${ROCPROFSYS_PAPI_SOURCE_DIR}/src
|
||||
COMMENT "Installing PAPI...")
|
||||
|
||||
|
||||
@@ -15,14 +15,12 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-threading
|
||||
rocprofiler_systems_add_interface_library(
|
||||
rocprofiler-systems-dyninst
|
||||
"Provides flags and libraries for Dyninst (dynamic instrumentation)")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-hip
|
||||
"Provides flags and libraries for HIP")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm
|
||||
"Provides flags and libraries for ROCm")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-roctracer
|
||||
"Provides flags and libraries for roctracer")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocprofiler
|
||||
"Provides flags and libraries for rocprofiler")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm-smi
|
||||
"Provides flags and libraries for rocm-smi")
|
||||
rocprofiler_systems_add_interface_library(
|
||||
rocprofiler-systems-rccl
|
||||
"Provides flags for ROCm Communication Collectives Library (RCCL)")
|
||||
@@ -50,10 +48,7 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-compile-definition
|
||||
|
||||
# libraries with relevant compile definitions
|
||||
set(ROCPROFSYS_EXTENSION_LIBRARIES
|
||||
rocprofiler-systems::rocprofiler-systems-hip
|
||||
rocprofiler-systems::rocprofiler-systems-roctracer
|
||||
rocprofiler-systems::rocprofiler-systems-rocprofiler
|
||||
rocprofiler-systems::rocprofiler-systems-rocm-smi
|
||||
rocprofiler-systems::rocprofiler-systems-rocm
|
||||
rocprofiler-systems::rocprofiler-systems-rccl
|
||||
rocprofiler-systems::rocprofiler-systems-bfd
|
||||
rocprofiler-systems::rocprofiler-systems-mpi
|
||||
@@ -127,14 +122,11 @@ endforeach()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# hip version
|
||||
# ROCm Version
|
||||
#
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
if(ROCPROFSYS_USE_HIP
|
||||
OR ROCPROFSYS_USE_ROCTRACER
|
||||
OR ROCPROFSYS_USE_ROCPROFILER
|
||||
OR ROCPROFSYS_USE_ROCM_SMI)
|
||||
if(ROCPROFSYS_USE_ROCM)
|
||||
find_package(ROCmVersion)
|
||||
|
||||
if(NOT ROCmVersion_FOUND)
|
||||
@@ -164,13 +156,13 @@ if(ROCPROFSYS_USE_HIP
|
||||
endif()
|
||||
|
||||
set(ROCPROFSYS_ROCM_VERSION ${ROCmVersion_FULL_VERSION})
|
||||
set(ROCPROFSYS_HIP_VERSION_MAJOR ${ROCmVersion_MAJOR_VERSION})
|
||||
set(ROCPROFSYS_HIP_VERSION_MINOR ${ROCmVersion_MINOR_VERSION})
|
||||
set(ROCPROFSYS_HIP_VERSION_PATCH ${ROCmVersion_PATCH_VERSION})
|
||||
set(ROCPROFSYS_HIP_VERSION ${ROCmVersion_TRIPLE_VERSION})
|
||||
set(ROCPROFSYS_ROCM_VERSION_MAJOR ${ROCmVersion_MAJOR_VERSION})
|
||||
set(ROCPROFSYS_ROCM_VERSION_MINOR ${ROCmVersion_MINOR_VERSION})
|
||||
set(ROCPROFSYS_ROCM_VERSION_PATCH ${ROCmVersion_PATCH_VERSION})
|
||||
set(ROCPROFSYS_ROCM_VERSION ${ROCmVersion_TRIPLE_VERSION})
|
||||
|
||||
if(ROCPROFSYS_HIP_VERSION_MAJOR GREATER_EQUAL 4 AND ROCPROFSYS_HIP_VERSION_MINOR
|
||||
GREATER 3)
|
||||
if(ROCPROFSYS_ROCM_VERSION_MAJOR GREATER_EQUAL 4 AND ROCPROFSYS_ROCM_VERSION_MINOR
|
||||
GREATER 3)
|
||||
set(roctracer_kfdwrapper_LIBRARY)
|
||||
endif()
|
||||
|
||||
@@ -181,64 +173,30 @@ if(ROCPROFSYS_USE_HIP
|
||||
rocprofiler_systems_add_feature(ROCPROFSYS_ROCM_VERSION
|
||||
"ROCm version used by rocprofiler-systems")
|
||||
else()
|
||||
set(ROCPROFSYS_HIP_VERSION "0.0.0")
|
||||
set(ROCPROFSYS_HIP_VERSION_MAJOR 0)
|
||||
set(ROCPROFSYS_HIP_VERSION_MINOR 0)
|
||||
set(ROCPROFSYS_HIP_VERSION_PATCH 0)
|
||||
set(ROCPROFSYS_ROCM_VERSION "0.0.0")
|
||||
set(ROCPROFSYS_ROCM_VERSION_MAJOR 0)
|
||||
set(ROCPROFSYS_ROCM_VERSION_MINOR 0)
|
||||
set(ROCPROFSYS_ROCM_VERSION_PATCH 0)
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# HIP
|
||||
# ROCm
|
||||
#
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
if(ROCPROFSYS_USE_HIP)
|
||||
find_package(hip ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
|
||||
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-hip
|
||||
INTERFACE ROCPROFSYS_USE_HIP)
|
||||
target_link_libraries(rocprofiler-systems-hip INTERFACE hip::host)
|
||||
endif()
|
||||
if(ROCPROFSYS_USE_ROCM)
|
||||
find_package(rocprofiler-sdk ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
|
||||
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocm
|
||||
INTERFACE ROCPROFSYS_USE_ROCM)
|
||||
target_link_libraries(rocprofiler-systems-rocm
|
||||
INTERFACE rocprofiler-sdk::rocprofiler-sdk)
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# roctracer
|
||||
#
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
if(ROCPROFSYS_USE_ROCTRACER)
|
||||
find_package(roctracer ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
|
||||
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-roctracer
|
||||
INTERFACE ROCPROFSYS_USE_ROCTRACER)
|
||||
target_link_libraries(
|
||||
rocprofiler-systems-roctracer
|
||||
INTERFACE roctracer::roctracer rocprofiler-systems::rocprofiler-systems-hip)
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# rocprofiler
|
||||
#
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
if(ROCPROFSYS_USE_ROCPROFILER)
|
||||
find_package(rocprofiler ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
|
||||
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocprofiler
|
||||
INTERFACE ROCPROFSYS_USE_ROCPROFILER)
|
||||
target_link_libraries(rocprofiler-systems-rocprofiler
|
||||
INTERFACE rocprofiler::rocprofiler)
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# rocm-smi
|
||||
#
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
if(ROCPROFSYS_USE_ROCM_SMI)
|
||||
find_package(rocm-smi ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
|
||||
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocm-smi
|
||||
INTERFACE ROCPROFSYS_USE_ROCM_SMI)
|
||||
target_link_libraries(rocprofiler-systems-rocm-smi INTERFACE rocm-smi::rocm-smi)
|
||||
target_link_libraries(rocprofiler-systems-rocm INTERFACE rocm-smi::rocm-smi)
|
||||
|
||||
# find_package(amd-smi ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
|
||||
# target_link_libraries(rocprofiler-systems-rocm INTERFACE amd-smi::amd-smi)
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
@@ -14,7 +14,3 @@ prepend-path PATH "${ROOT}/bin"
|
||||
prepend-path LD_LIBRARY_PATH "${ROOT}/@CMAKE_INSTALL_LIBDIR@"
|
||||
prepend-path PYTHONPATH "${ROOT}/@CMAKE_INSTALL_PYTHONDIR@"
|
||||
setenv @PROJECT_NAME_UNDERSCORED@_DIR "${ROOT}/@CMAKE_INSTALL_DATAROOTDIR@/cmake/@PROJECT_NAME@"
|
||||
|
||||
# @ROCPROFSYS_HSA_ENV@setenv HSA_TOOLS_LIB "${ROOT}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@"
|
||||
# @ROCPROFSYS_HSA_ENV@setenv HSA_TOOLS_REPORT_LOAD_FAILURE 1
|
||||
# @ROCPROFSYS_ROCP_ENV@setenv ROCP_TOOL_LIB "${ROOT}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@"
|
||||
|
||||
@@ -26,12 +26,3 @@ export LD_LIBRARY_PATH
|
||||
export PYTHONPATH
|
||||
export CMAKE_PREFIX_PATH
|
||||
export @PROJECT_NAME_UNDERSCORED@_DIR
|
||||
|
||||
# ROCm environment variables
|
||||
# @ROCPROFSYS_HSA_ENV@HSA_TOOLS_LIB="${BASEDIR}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys-dl@CMAKE_SHARED_LIBRARY_SUFFIX@"
|
||||
# @ROCPROFSYS_HSA_ENV@HSA_TOOLS_REPORT_LOAD_FAILURE=1
|
||||
# @ROCPROFSYS_ROCP_ENV@ROCP_TOOL_LIB="${BASEDIR}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@"
|
||||
|
||||
# @ROCPROFSYS_HSA_ENV@export HSA_TOOLS_LIB
|
||||
# @ROCPROFSYS_HSA_ENV@export HSA_TOOLS_REPORT_LOAD_FAILURE
|
||||
# @ROCPROFSYS_ROCP_ENV@export ROCP_TOOL_LIB
|
||||
|
||||
@@ -25,7 +25,7 @@ RUN zypper --non-interactive update -y && \
|
||||
zypper --non-interactive install -y -t pattern devel_basis && \
|
||||
zypper --non-interactive install -y binutils-gold cmake curl dpkg-devel \
|
||||
gcc-c++ git libnuma-devel openmpi3-devel python3-pip rpm-build wget && \
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
python3 -m pip install 'cmake==3.21'
|
||||
|
||||
ARG ROCM_VERSION=0.0
|
||||
ARG AMDGPU_RPM=6.2/sle/15.6/amdgpu-install-6.2.60200-1.noarch.rpm
|
||||
|
||||
@@ -31,7 +31,7 @@ RUN zypper --non-interactive update -y && \
|
||||
gcc-c++ git libnuma-devel openmpi3-devel papi-devel python3-pip \
|
||||
rpm-build wget && \
|
||||
zypper --non-interactive clean --all && \
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
python3 -m pip install 'cmake==3.21'
|
||||
|
||||
COPY ./dyninst-source /tmp/dyninst
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ RUN yum groupinstall -y "Development Tools" && \
|
||||
yum install -y --allowerasing cmake curl dpkg-devel numactl-devel openmpi-devel \
|
||||
papi-devel python3-pip texinfo wget which zlib-devel && \
|
||||
yum clean all && \
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
python3 -m pip install 'cmake==3.21'
|
||||
|
||||
ARG ROCM_VERSION=0.0
|
||||
ARG AMDGPU_RPM=6.2/rhel/9.4/amdgpu-install-6.2.60202-1.el9.noarch.rpm
|
||||
|
||||
@@ -22,7 +22,7 @@ RUN yum groupinstall -y "Development Tools" && \
|
||||
yum install -y --allowerasing cmake curl dpkg-devel numactl-devel \
|
||||
openmpi-devel papi-devel python3-pip texinfo wget which zlib-devel && \
|
||||
yum clean all && \
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
python3 -m pip install 'cmake==3.21'
|
||||
|
||||
COPY ./dyninst-source /tmp/dyninst
|
||||
|
||||
|
||||
@@ -30,9 +30,9 @@ RUN apt-get update && \
|
||||
python3-pip rpm texinfo wget && \
|
||||
OS_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed 's/=/ /'1 | awk '{print $NF}' | sed 's/"//g') && \
|
||||
if [ "${OS_VERSION}" == "24.04" ]; then \
|
||||
python3 -m pip install --break-system-packages 'cmake==3.18.4'; \
|
||||
python3 -m pip install --break-system-packages 'cmake==3.21'; \
|
||||
else \
|
||||
python3 -m pip install 'cmake==3.18.4'; \
|
||||
python3 -m pip install 'cmake==3.21'; \
|
||||
fi
|
||||
|
||||
RUN if [ "${ROCM_VERSION}" != "0.0" ]; then \
|
||||
|
||||
@@ -31,9 +31,9 @@ RUN apt-get update && \
|
||||
python3-pip texinfo unzip wget zip zlib1g-dev && \
|
||||
apt-get autoclean && \
|
||||
if [ "${OS_VERSION}" == "24.04" ]; then \
|
||||
python3 -m pip install --break-system-packages 'cmake==3.18.4' \
|
||||
python3 -m pip install --break-system-packages 'cmake==3.21' \
|
||||
else \
|
||||
python3 -m pip install 'cmake==3.18.4'; \
|
||||
python3 -m pip install 'cmake==3.21'; \
|
||||
fi
|
||||
|
||||
COPY ./dyninst-source /tmp/dyninst
|
||||
|
||||
@@ -228,7 +228,7 @@ Generating a default configuration file
|
||||
ROCPROFSYS_PROFILE = false
|
||||
ROCPROFSYS_USE_SAMPLING = false
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING = true
|
||||
ROCPROFSYS_USE_ROCTRACER = true
|
||||
ROCPROFSYS_USE_ROCM = true
|
||||
ROCPROFSYS_USE_ROCM_SMI = true
|
||||
ROCPROFSYS_USE_KOKKOSP = false
|
||||
ROCPROFSYS_USE_CODE_COVERAGE = false
|
||||
@@ -248,9 +248,6 @@ Generating a default configuration file
|
||||
ROCPROFSYS_PERFETTO_FILE = perfetto-trace.proto
|
||||
ROCPROFSYS_PERFETTO_FILL_POLICY = discard
|
||||
ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB = 4096
|
||||
ROCPROFSYS_ROCTRACER_HSA_ACTIVITY = false
|
||||
ROCPROFSYS_ROCTRACER_HSA_API = false
|
||||
ROCPROFSYS_ROCTRACER_HSA_API_TYPES =
|
||||
ROCPROFSYS_SAMPLING_CPUS =
|
||||
ROCPROFSYS_SAMPLING_DELAY = 0.5
|
||||
ROCPROFSYS_SAMPLING_FREQ = 10
|
||||
@@ -363,13 +360,10 @@ Viewing the setting descriptions
|
||||
| ROCPROFSYS_PERFETTO_FILL_POLICY | Behavior when perfetto buffer is ful... |
|
||||
| ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... |
|
||||
| ROCPROFSYS_PRECISION | Set the global output precision for ... |
|
||||
| ROCPROFSYS_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support |
|
||||
| ROCPROFSYS_ROCTRACER_HSA_API | Enable HSA API tracing support |
|
||||
| ROCPROFSYS_ROCTRACER_HSA_API_TYPES | HSA API type to collect |
|
||||
| ROCPROFSYS_SAMPLING_CPUS | CPUs to collect frequency informatio... |
|
||||
| ROCPROFSYS_SAMPLING_DELAY | Number of seconds to wait before the... |
|
||||
| ROCPROFSYS_SAMPLING_FREQ | Number of software interrupts per se... |
|
||||
| ROCPROFSYS_SAMPLING_GPUS | Devices to query when ROCPROFSYS_USE_... |
|
||||
| ROCPROFSYS_SAMPLING_GPUS | Devices to query when ROCPROFSYS_USE... |
|
||||
| ROCPROFSYS_SCIENTIFIC | Set the global numerical reporting t... |
|
||||
| ROCPROFSYS_STRICT_CONFIG | Throw errors for unknown setting nam... |
|
||||
| ROCPROFSYS_SUPPRESS_CONFIG | Disable processing of setting config... |
|
||||
@@ -391,13 +385,13 @@ Viewing the setting descriptions
|
||||
| ROCPROFSYS_TRACE | Enable perfetto backend |
|
||||
| ROCPROFSYS_USE_PID | Enable tagging filenames with proces... |
|
||||
| ROCPROFSYS_USE_ROCM_SMI | Enable sampling GPU power, temp, uti... |
|
||||
| ROCPROFSYS_USE_ROCTRACER | Enable ROCM tracing |
|
||||
| ROCPROFSYS_USE_ROCM | Enable ROCM tracing |
|
||||
| ROCPROFSYS_USE_SAMPLING | Enable statistical sampling of call-... |
|
||||
| ROCPROFSYS_USE_PROCESS_SAMPLING | Enable a background thread which sam... |
|
||||
| ROCPROFSYS_PROFILE | Enable timemory backend |
|
||||
| ROCPROFSYS_VERBOSE | Verbosity level |
|
||||
| ROCPROFSYS_WIDTH | Set the global output width for comp... |
|
||||
|-----------------------------------------|-----------------------------------------|
|
||||
|------------------------------------------|-----------------------------------------|
|
||||
|
||||
Viewing components
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
@@ -268,8 +268,6 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
|
||||
|
||||
$ rocprof-sys-sample -- ./parallel-overhead-locks 30 4 100
|
||||
|
||||
HSA_TOOLS_LIB=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
|
||||
HSA_TOOLS_REPORT_LOAD_FAILURE=1
|
||||
LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=false
|
||||
ROCPROFSYS_USE_SAMPLING=true
|
||||
@@ -283,8 +281,6 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
|
||||
|
||||
$ rocprof-sys-sample -PTDH -I all -- ./parallel-overhead-locks 30 4 100
|
||||
|
||||
HSA_TOOLS_LIB=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
|
||||
HSA_TOOLS_REPORT_LOAD_FAILURE=1
|
||||
KOKKOS_PROFILE_LIBRARY=/opt/rocprofiler-systems/lib/librocprof-sys.so.1.7.1
|
||||
LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
|
||||
ROCPROFSYS_CPU_FREQ_ENABLED=true
|
||||
@@ -298,9 +294,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=true
|
||||
ROCPROFSYS_USE_RCCLP=true
|
||||
ROCPROFSYS_USE_ROCM_SMI=true
|
||||
ROCPROFSYS_USE_ROCPROFILER=true
|
||||
ROCPROFSYS_USE_ROCTRACER=true
|
||||
ROCPROFSYS_USE_ROCTX=true
|
||||
ROCPROFSYS_USE_ROCM=true
|
||||
ROCPROFSYS_USE_SAMPLING=true
|
||||
ROCPROFSYS_PROFILE=true
|
||||
OMP_TOOL_LIBRARIES=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
|
||||
@@ -330,9 +324,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=true
|
||||
ROCPROFSYS_USE_RCCLP=false
|
||||
ROCPROFSYS_USE_ROCM_SMI=false
|
||||
ROCPROFSYS_USE_ROCPROFILER=false
|
||||
ROCPROFSYS_USE_ROCTRACER=false
|
||||
ROCPROFSYS_USE_ROCTX=false
|
||||
ROCPROFSYS_USE_ROCM=false
|
||||
ROCPROFSYS_USE_SAMPLING=true
|
||||
ROCPROFSYS_PROFILE=true
|
||||
...
|
||||
@@ -363,9 +355,7 @@ Here is the full output from the previous
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=true
|
||||
ROCPROFSYS_USE_RCCLP=false
|
||||
ROCPROFSYS_USE_ROCM_SMI=false
|
||||
ROCPROFSYS_USE_ROCPROFILER=false
|
||||
ROCPROFSYS_USE_ROCTRACER=false
|
||||
ROCPROFSYS_USE_ROCTX=false
|
||||
ROCPROFSYS_USE_ROCM=false
|
||||
ROCPROFSYS_USE_SAMPLING=true
|
||||
[rocprof-sys][dl][1785877] rocprofsys_main
|
||||
[rocprof-sys][1785877][rocprofsys_init_tooling] Instrumentation mode: Sampling
|
||||
|
||||
@@ -241,8 +241,8 @@ Installing ROCm Systems Profiler
|
||||
-----------------------------------
|
||||
|
||||
ROCm Systems Profiler has CMake configuration options for MPI support (``ROCPROFSYS_USE_MPI`` or
|
||||
``ROCPROFSYS_USE_MPI_HEADERS``), HIP kernel tracing (``ROCPROFSYS_USE_ROCTRACER``),
|
||||
ROCm device sampling (``ROCPROFSYS_USE_ROCM_SMI``), OpenMP-Tools (``ROCPROFSYS_USE_OMPT``),
|
||||
``ROCPROFSYS_USE_MPI_HEADERS``),
|
||||
ROCm tracing and sampling (``ROCPROFSYS_USE_ROCM``), OpenMP-Tools (``ROCPROFSYS_USE_OMPT``),
|
||||
hardware counters via PAPI (``ROCPROFSYS_USE_PAPI``), among other features.
|
||||
Various additional features can be enabled via the
|
||||
``TIMEMORY_USE_*`` `CMake options <https://timemory.readthedocs.io/en/develop/installation.html#cmake-options>`_.
|
||||
@@ -256,22 +256,20 @@ in `the Perfetto UI <https://ui.perfetto.dev>`_.
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/rocprofiler-systems.git rocprof-sys-source
|
||||
cmake \
|
||||
-B rocprof-sys-build \
|
||||
cmake \
|
||||
-B rocprof-sys-build \
|
||||
-D CMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems \
|
||||
-D ROCPROFSYS_USE_HIP=ON \
|
||||
-D ROCPROFSYS_USE_ROCM_SMI=ON \
|
||||
-D ROCPROFSYS_USE_ROCTRACER=ON \
|
||||
-D ROCPROFSYS_USE_PYTHON=ON \
|
||||
-D ROCPROFSYS_USE_OMPT=ON \
|
||||
-D ROCPROFSYS_USE_MPI_HEADERS=ON \
|
||||
-D ROCPROFSYS_BUILD_PAPI=ON \
|
||||
-D ROCPROFSYS_BUILD_LIBUNWIND=ON \
|
||||
-D ROCPROFSYS_BUILD_DYNINST=ON \
|
||||
-D DYNINST_BUILD_TBB=ON \
|
||||
-D DYNINST_BUILD_BOOST=ON \
|
||||
-D DYNINST_BUILD_ELFUTILS=ON \
|
||||
-D DYNINST_BUILD_LIBIBERTY=ON \
|
||||
-D ROCPROFSYS_USE_ROCM=ON \
|
||||
-D ROCPROFSYS_USE_PYTHON=ON \
|
||||
-D ROCPROFSYS_USE_OMPT=ON \
|
||||
-D ROCPROFSYS_USE_MPI_HEADERS=ON \
|
||||
-D ROCPROFSYS_BUILD_PAPI=ON \
|
||||
-D ROCPROFSYS_BUILD_LIBUNWIND=ON \
|
||||
-D ROCPROFSYS_BUILD_DYNINST=ON \
|
||||
-D DYNINST_BUILD_TBB=ON \
|
||||
-D DYNINST_BUILD_BOOST=ON \
|
||||
-D DYNINST_BUILD_ELFUTILS=ON \
|
||||
-D DYNINST_BUILD_LIBIBERTY=ON \
|
||||
rocprof-sys-source
|
||||
cmake --build rocprof-sys-build --target all --parallel 8
|
||||
cmake --build rocprof-sys-build --target install
|
||||
|
||||
@@ -372,7 +372,7 @@ if [ "${IS_DOCKER}" -ne 0 ]; then git config --global --add safe.directory ${PWD
|
||||
|
||||
verbose-run echo "Build rocprofiler-systems installers with generators: ${GENERATORS}"
|
||||
|
||||
build-and-package ${WITH_CORE} ${DISTRO}-core -DROCPROFSYS_USE_HIP=OFF -DROCPROFSYS_USE_MPI=OFF
|
||||
build-and-package ${WITH_MPI} ${DISTRO}-${MPI_IMPL} -DROCPROFSYS_USE_HIP=OFF -DROCPROFSYS_USE_MPI=ON
|
||||
build-and-package ${WITH_ROCM} ${DISTRO}-rocm-${ROCM_VERSION} -DROCPROFSYS_USE_HIP=ON -DROCPROFSYS_USE_MPI=OFF
|
||||
build-and-package ${WITH_ROCM_MPI} ${DISTRO}-rocm-${ROCM_VERSION}-${MPI_IMPL} -DROCPROFSYS_USE_HIP=ON -DROCPROFSYS_USE_MPI=ON
|
||||
build-and-package ${WITH_CORE} ${DISTRO}-core -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_MPI=OFF
|
||||
build-and-package ${WITH_MPI} ${DISTRO}-${MPI_IMPL} -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_MPI=ON
|
||||
build-and-package ${WITH_ROCM} ${DISTRO}-rocm-${ROCM_VERSION} -DROCPROFSYS_USE_ROCM=ON -DROCPROFSYS_USE_MPI=OFF
|
||||
build-and-package ${WITH_ROCM_MPI} ${DISTRO}-rocm-${ROCM_VERSION}-${MPI_IMPL} -DROCPROFSYS_USE_ROCM=ON -DROCPROFSYS_USE_MPI=ON
|
||||
|
||||
@@ -1,17 +1,8 @@
|
||||
# executable RPATH
|
||||
|
||||
if(ROCPROFSYS_USE_ROCPROFILER
|
||||
AND rocprofiler_LIBRARY_DIR
|
||||
AND ROCmVersion_TRIPLE_VERSION VERSION_LESS 5.2.0
|
||||
AND NOT CMAKE_INSTALL_RPATH_USE_LINK_PATH)
|
||||
set(ROCPROFSYS_EXE_INSTALL_RPATH
|
||||
"\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}:${rocprofiler_LIBRARY_DIR}"
|
||||
)
|
||||
else()
|
||||
set(ROCPROFSYS_EXE_INSTALL_RPATH
|
||||
"\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}"
|
||||
)
|
||||
endif()
|
||||
set(ROCPROFSYS_EXE_INSTALL_RPATH
|
||||
"\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}"
|
||||
)
|
||||
|
||||
# executables
|
||||
add_subdirectory(rocprof-sys-avail)
|
||||
|
||||
@@ -33,8 +33,7 @@
|
||||
#include "api.hpp"
|
||||
#include "core/config.hpp"
|
||||
#include "core/gpu.hpp"
|
||||
#include "core/hip_runtime.hpp"
|
||||
#include "library/rocprofiler.hpp"
|
||||
#include "library/rocm.hpp"
|
||||
|
||||
#include <timemory/components.hpp>
|
||||
#include <timemory/components/definition.hpp>
|
||||
@@ -119,7 +118,7 @@ write_hw_counter_info(std::ostream&, const array_t<bool, N>& = {},
|
||||
namespace
|
||||
{
|
||||
// initialize HIP before main so that librocprof-sys is not HSA_TOOLS_LIB
|
||||
int gpu_count = rocprofsys::gpu::hip_device_count();
|
||||
int gpu_count = rocprofsys::gpu::device_count();
|
||||
|
||||
// statically allocated shared_ptrs to prevent use after free errors
|
||||
auto timemory_manager = tim::manager::master_instance();
|
||||
@@ -508,15 +507,15 @@ main(int argc, char** argv)
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
#if ROCPROFSYS_USE_HIP > 0
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
if(gpu_count > 0)
|
||||
{
|
||||
size_t _num_metrics = 0;
|
||||
try
|
||||
{
|
||||
// call to rocm_metrics() will add choices to ROCPROFSYS_ROCM_EVENTS setting
|
||||
// call to rocm_events() will add choices to ROCPROFSYS_ROCM_EVENTS setting
|
||||
// so always perform this call even if list of HW counters is not requested
|
||||
_num_metrics = rocprofsys::rocprofiler::rocm_metrics().size();
|
||||
_num_metrics = rocprofsys::rocm::rocm_events().size();
|
||||
} catch(std::runtime_error& _e)
|
||||
{
|
||||
verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what());
|
||||
@@ -615,9 +614,9 @@ main(int argc, char** argv)
|
||||
}
|
||||
}
|
||||
|
||||
signal(SIGABRT, &dump_log_abort);
|
||||
signal(SIGSEGV, &dump_log_abort);
|
||||
signal(SIGQUIT, &dump_log_abort);
|
||||
// signal(SIGABRT, &dump_log_abort);
|
||||
// signal(SIGSEGV, &dump_log_abort);
|
||||
// signal(SIGQUIT, &dump_log_abort);
|
||||
|
||||
if(!os) os = &std::cout;
|
||||
|
||||
@@ -641,6 +640,8 @@ main(int argc, char** argv)
|
||||
}
|
||||
dump_log();
|
||||
|
||||
const_cast<std::shared_ptr<tim::settings>&>(tim::settings::shared_instance()).reset();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1076,7 +1077,7 @@ write_hw_counter_info(std::ostream& os, const array_t<bool, N>& options,
|
||||
|
||||
auto _papi_events = tim::papi::available_events_info();
|
||||
auto _rocm_events =
|
||||
(gpu_count > 0) ? rocprofsys::rocprofiler::rocm_metrics() : hwcounter_info_t{};
|
||||
(gpu_count > 0) ? rocprofsys::rocm::rocm_events() : hwcounter_info_t{};
|
||||
|
||||
if(alphabetical)
|
||||
{
|
||||
|
||||
@@ -339,7 +339,7 @@ generate_config(std::string _config_file, const std::set<std::string>& _config_f
|
||||
for(const auto* itr :
|
||||
{ "ROCPROFSYS_CONFIG", "ROCPROFSYS_MODE", "ROCPROFSYS_TRACE",
|
||||
"ROCPROFSYS_PROFILE", "ROCPROFSYS_USE_SAMPLING",
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCTRACER",
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCM",
|
||||
"ROCPROFSYS_USE_ROCM_SMI", "ROCPROFSYS_USE_KOKKOSP",
|
||||
"ROCPROFSYS_USE_OMPT", "ROCPROFSYS_USE", "ROCPROFSYS_OUTPUT" })
|
||||
{
|
||||
|
||||
@@ -29,8 +29,6 @@
|
||||
#include "library/components/fork_gotcha.hpp"
|
||||
#include "library/components/mpi_gotcha.hpp"
|
||||
#include "library/components/pthread_gotcha.hpp"
|
||||
#include "library/components/rocprofiler.hpp"
|
||||
#include "library/components/roctracer.hpp"
|
||||
|
||||
#include <timemory/components/definition.hpp>
|
||||
#include <timemory/enum.h>
|
||||
|
||||
@@ -752,10 +752,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env,
|
||||
|
||||
parser.end_group();
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION > 0 && ROCPROFSYS_HIP_VERSION < 50300
|
||||
update_env(_env, "HSA_ENABLE_INTERRUPT", 0);
|
||||
#endif
|
||||
|
||||
auto _inpv = std::vector<char*>{};
|
||||
auto _outv = std::vector<char*>{};
|
||||
bool _hash = false;
|
||||
@@ -824,11 +820,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env,
|
||||
add_default_env(_env, "ROCPROFSYS_USE_MPIP", true);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
|
||||
add_default_env(_env, "ROCPROFSYS_ROCTRACER_HIP_API", true);
|
||||
add_default_env(_env, "ROCPROFSYS_ROCTRACER_HSA_API", true);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
|
||||
add_default_env(_env, "ROCPROFSYS_USE_RCCLP", true);
|
||||
#endif
|
||||
|
||||
@@ -35,6 +35,8 @@ target_link_libraries(
|
||||
timemory::timemory-extensions
|
||||
timemory::timemory-core)
|
||||
|
||||
add_target_flag_if_avail(rocprofiler-systems-instrument "-Wno-deprecated-declarations")
|
||||
|
||||
set_target_properties(
|
||||
rocprofiler-systems-instrument
|
||||
PROPERTIES BUILD_RPATH "\$ORIGIN:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
|
||||
|
||||
@@ -312,13 +312,25 @@ get_internal_basic_libs_impl()
|
||||
"liblzma.so" };
|
||||
|
||||
// shared libraries used by rocprof-sys
|
||||
const auto _omni_libs = strview_init_t{
|
||||
"libstdc++.so.6", "libgotcha.so", "libunwind-coredump.so",
|
||||
"libunwind-generic.so", "libunwind-ptrace.so", "libunwind-setjmp.so",
|
||||
"libunwind.so", "libunwind-x86_64.so", "librocm_smi64.so",
|
||||
"libroctx64.so", "librocmtools.so", "libroctracer64.so",
|
||||
"librocprofiler64.so", "libpapi.so", "libpfm.so"
|
||||
};
|
||||
const auto _omni_libs = strview_init_t{ "libstdc++.so.6",
|
||||
"libgotcha.so",
|
||||
"libunwind-coredump.so",
|
||||
"libunwind-generic.so",
|
||||
"libunwind-ptrace.so",
|
||||
"libunwind-setjmp.so",
|
||||
"libunwind.so",
|
||||
"libunwind-x86_64.so",
|
||||
"librocm_smi64.so",
|
||||
"libroctx64.so",
|
||||
"librocmtools.so",
|
||||
"libroctracer64.so",
|
||||
"librocprofiler64.so",
|
||||
"libpapi.so",
|
||||
"libpfm.so",
|
||||
"librocprofiler-register.so",
|
||||
"librocprofiler-sdk.so",
|
||||
"librocprofiler-sdk-roctx.so",
|
||||
"libamd_smi.so" };
|
||||
|
||||
// shared libraries potentially used by timemory
|
||||
const auto _3rdparty_libs = strview_init_t{ "libcaliper.so",
|
||||
|
||||
+6
-4
@@ -357,10 +357,12 @@ main(int argc, char** argv)
|
||||
itr.find("rocprof-sys") != std::string::npos ||
|
||||
itr.find("rocprofiler-systems") != std::string::npos ||
|
||||
std::regex_search(
|
||||
itr, std::regex{ "lib(dyninstAPI|stackwalk|pcontrol|patchAPI|parseAPI|"
|
||||
"instructionAPI|symtabAPI|dynDwarf|common|dynElf|tbb|"
|
||||
"tbbmalloc|tbbmalloc_proxy|gotcha|libunwind|roctracer|"
|
||||
"hsa-runtime|amdhip|rocm_smi)\\.(so|a)" }))
|
||||
itr, std::regex{
|
||||
"lib(dyninstAPI|stackwalk|pcontrol|patchAPI|parseAPI|"
|
||||
"instructionAPI|symtabAPI|dynDwarf|common|dynElf|tbb|tbbmalloc|"
|
||||
"tbbmalloc_proxy|gotcha|libunwind|roctracer64|hsa-runtime|amdhip|"
|
||||
"amd_comgr|rocm_smi64|rocprofiler64|rocprofiler-register|"
|
||||
"rocprofiler-sdk|rocprofiler-sdk-roctx|amd_smi)\\.(so|a)" }))
|
||||
{
|
||||
if(!find(filepath::dirname(itr), lib_search_paths))
|
||||
lib_search_paths.emplace_back(filepath::dirname(itr));
|
||||
|
||||
@@ -44,14 +44,6 @@
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
# define ROCPROFSYS_USE_ROCTRACER 0
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
# define ROCPROFSYS_USE_ROCPROFILER 0
|
||||
#endif
|
||||
|
||||
namespace color = tim::log::color;
|
||||
using namespace timemory::join;
|
||||
using tim::get_env;
|
||||
@@ -140,17 +132,6 @@ get_initial_environment()
|
||||
|
||||
update_env(_env, "ROCPROFSYS_USE_SAMPLING", (_mode != "causal"));
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) || defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
update_env(_env, "HSA_TOOLS_LIB", _dl_libpath);
|
||||
if(!getenv("HSA_TOOLS_REPORT_LOAD_FAILURE"))
|
||||
update_env(_env, "HSA_TOOLS_REPORT_LOAD_FAILURE", "1");
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
update_env(_env, "ROCP_TOOL_LIB", _omni_libpath);
|
||||
if(!getenv("ROCP_HSA_INTERCEPT")) update_env(_env, "ROCP_HSA_INTERCEPT", "1");
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_OMPT)
|
||||
if(!getenv("OMP_TOOL_LIBRARIES"))
|
||||
update_env(_env, "OMP_TOOL_LIBRARIES", _dl_libpath, UPD_APPEND);
|
||||
@@ -357,14 +338,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
%{INDENT}% 0 avoid triggering the bug, potentially at the cost of reduced performance
|
||||
%{INDENT}% 1 do not modify how ROCm is notified about kernel completion)";
|
||||
|
||||
auto _realtime_reqs = (get_env("HSA_ENABLE_INTERRUPT", std::string{}, false).empty())
|
||||
? std::vector<std::string>{ "hsa-interrupt" }
|
||||
: std::vector<std::string>{};
|
||||
|
||||
#if ROCPROFSYS_USE_ROCTRACER == 0 && ROCPROFSYS_USE_ROCPROFILER == 0
|
||||
_realtime_reqs.clear();
|
||||
#endif
|
||||
|
||||
const auto* _trace_policy_desc =
|
||||
R"(Policy for new data when the buffer size limit is reached:
|
||||
%{INDENT}%- discard : new data is ignored
|
||||
@@ -720,7 +693,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
|
||||
parser.add_argument({ "--realtime" }, _realtime_desc)
|
||||
.min_count(0)
|
||||
.required(std::move(_realtime_reqs))
|
||||
.action([&](parser_t& p) {
|
||||
auto _v = p.get<std::deque<std::string>>("realtime");
|
||||
update_env(_env, "ROCPROFSYS_SAMPLING_REALTIME", true);
|
||||
@@ -741,10 +713,20 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
}
|
||||
});
|
||||
|
||||
std::set<std::string> _backend_choices = { "all", "kokkosp", "mpip",
|
||||
"ompt", "rcclp", "rocm-smi",
|
||||
"roctracer", "rocprofiler", "roctx",
|
||||
"mutex-locks", "spin-locks", "rw-locks" };
|
||||
std::set<std::string> _backend_choices = { "all",
|
||||
"kokkosp",
|
||||
"mpip",
|
||||
"ompt",
|
||||
"rcclp",
|
||||
"rocm-smi",
|
||||
"roctracer",
|
||||
"rocprofiler",
|
||||
"roctx",
|
||||
"mutex-locks",
|
||||
"spin-locks",
|
||||
"rw-locks",
|
||||
"rocprofiler-sdk",
|
||||
"rocm" };
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)
|
||||
_backend_choices.erase("mpip");
|
||||
@@ -758,17 +740,10 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
_backend_choices.erase("rcclp");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
_backend_choices.erase("rocm");
|
||||
_backend_choices.erase("rocm-smi");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
_backend_choices.erase("roctracer");
|
||||
_backend_choices.erase("roctx");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
_backend_choices.erase("rocprofiler");
|
||||
_backend_choices.erase("rocprofiler-sdk");
|
||||
#endif
|
||||
|
||||
parser.start_group("BACKEND OPTIONS",
|
||||
@@ -784,11 +759,9 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
|
||||
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
|
||||
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
|
||||
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
|
||||
@@ -810,27 +783,18 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
|
||||
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
|
||||
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
|
||||
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
|
||||
|
||||
if(_v.count("all") > 0 ||
|
||||
(_v.count("roctracer") > 0 && _v.count("rocprofiler") > 0))
|
||||
{
|
||||
remove_env(_env, "HSA_TOOLS_LIB");
|
||||
remove_env(_env, "HSA_TOOLS_REPORT_LOAD_FAILURE");
|
||||
}
|
||||
|
||||
if(_v.count("all") > 0 || _v.count("rocprofiler") > 0)
|
||||
{
|
||||
remove_env(_env, "ROCP_TOOL_LIB");
|
||||
remove_env(_env, "ROCP_HSA_INTERCEPT");
|
||||
}
|
||||
// if(_v.count("all") > 0 || _v.count("rocprofiler") > 0)
|
||||
// {
|
||||
// remove_env(_env, "ROCP_TOOL_LIB");
|
||||
// remove_env(_env, "ROCP_HSA_INTERCEPT");
|
||||
// }
|
||||
|
||||
if(_v.count("all") > 0 || _v.count("ompt") > 0)
|
||||
remove_env(_env, "OMP_TOOL_LIBRARIES");
|
||||
@@ -850,18 +814,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
update_env(_env, "ROCPROFSYS_PAPI_EVENTS", _events);
|
||||
});
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
parser
|
||||
.add_argument({ "-G", "--gpu-events" },
|
||||
"Set the GPU hardware counter events to record (ref: "
|
||||
"`rocprof-sys-avail -H -c GPU`)")
|
||||
.action([&](parser_t& p) {
|
||||
auto _events =
|
||||
join(array_config{ "," }, p.get<std::vector<std::string>>("gpu-events"));
|
||||
update_env(_env, "ROCPROFSYS_ROCM_EVENTS", _events);
|
||||
});
|
||||
#endif
|
||||
|
||||
parser.start_group("MISCELLANEOUS OPTIONS", "");
|
||||
parser
|
||||
.add_argument({ "-i", "--inlines" },
|
||||
|
||||
@@ -12,15 +12,7 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20)
|
||||
cmake_policy(SET CMP0115 NEW)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_ROCPROFILER
|
||||
AND rocprofiler_LIBRARY_DIR
|
||||
AND ROCmVersion_TRIPLE_VERSION VERSION_LESS 5.2.0
|
||||
AND NOT CMAKE_INSTALL_RPATH_USE_LINK_PATH)
|
||||
set(ROCPROFSYS_LIB_INSTALL_RPATH
|
||||
"\$ORIGIN:\$ORIGIN/${PROJECT_NAME}:${rocprofiler_LIBRARY_DIR}")
|
||||
else()
|
||||
set(ROCPROFSYS_LIB_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/${PROJECT_NAME}")
|
||||
endif()
|
||||
set(ROCPROFSYS_LIB_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/${PROJECT_NAME}")
|
||||
|
||||
# ------------------------------------------------------------------------------#
|
||||
#
|
||||
@@ -50,10 +42,7 @@ target_link_libraries(
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-bfd>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-mpi>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-ptl>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-hip>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-roctracer>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocprofiler>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm-smi>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rccl>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libgcc-optional>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libstdcxx-optional>
|
||||
|
||||
@@ -19,7 +19,9 @@ target_sources(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/environment.hpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/invoke.hpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/join.hpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/static_object.hpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/synchronized.hpp)
|
||||
|
||||
get_filename_component(COMMON_SOURCE_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" DIRECTORY)
|
||||
get_filename_component(COMMON_BINARY_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}" DIRECTORY)
|
||||
|
||||
@@ -42,10 +42,10 @@
|
||||
#define ROCPROFSYS_COMPILER_STRING ROCPROFSYS_COMPILER_ID " v" ROCPROFSYS_COMPILER_VERSION
|
||||
|
||||
#define ROCPROFSYS_DEFAULT_ROCM_PATH "@ROCmVersion_DIR@"
|
||||
#define ROCPROFSYS_HIP_VERSION_STRING "@ROCPROFSYS_HIP_VERSION@"
|
||||
#define ROCPROFSYS_HIP_VERSION_MAJOR @ROCPROFSYS_HIP_VERSION_MAJOR@
|
||||
#define ROCPROFSYS_HIP_VERSION_MINOR @ROCPROFSYS_HIP_VERSION_MINOR@
|
||||
#define ROCPROFSYS_HIP_VERSION_PATCH @ROCPROFSYS_HIP_VERSION_PATCH@
|
||||
#define ROCPROFSYS_ROCM_VERSION_STRING "@ROCPROFSYS_ROCM_VERSION@"
|
||||
#define ROCPROFSYS_ROCM_VERSION_MAJOR @ROCPROFSYS_ROCM_VERSION_MAJOR@
|
||||
#define ROCPROFSYS_ROCM_VERSION_MINOR @ROCPROFSYS_ROCM_VERSION_MINOR@
|
||||
#define ROCPROFSYS_ROCM_VERSION_PATCH @ROCPROFSYS_ROCM_VERSION_PATCH@
|
||||
|
||||
// these can be set via defining the variable in CMake, e.g.:
|
||||
// cmake -D ROCPROFSYS_CACHELINE_SIZE=N /path/to/source
|
||||
@@ -63,15 +63,15 @@
|
||||
((10000 * ROCPROFSYS_VERSION_MAJOR) + (100 * ROCPROFSYS_VERSION_MINOR) + \
|
||||
ROCPROFSYS_VERSION_PATCH)
|
||||
|
||||
#define ROCPROFSYS_HIP_VERSION \
|
||||
((10000 * ROCPROFSYS_HIP_VERSION_MAJOR) + (100 * ROCPROFSYS_HIP_VERSION_MINOR) + \
|
||||
ROCPROFSYS_HIP_VERSION_PATCH)
|
||||
#define ROCPROFSYS_ROCM_VERSION \
|
||||
((10000 * ROCPROFSYS_ROCM_VERSION_MAJOR) + (100 * ROCPROFSYS_ROCM_VERSION_MINOR) + \
|
||||
ROCPROFSYS_ROCM_VERSION_PATCH)
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION_MAJOR > 0
|
||||
# define ROCPROFSYS_HIP_VERSION_COMPAT_STRING \
|
||||
"v@ROCPROFSYS_HIP_VERSION_MAJOR@.@ROCPROFSYS_HIP_VERSION_MINOR@.x"
|
||||
#if ROCPROFSYS_ROCM_VERSION_MAJOR > 0
|
||||
# define ROCPROFSYS_ROCM_VERSION_COMPAT_STRING \
|
||||
"v@ROCPROFSYS_ROCM_VERSION_MAJOR@.@ROCPROFSYS_ROCM_VERSION_MINOR@.x"
|
||||
#else
|
||||
# define ROCPROFSYS_HIP_VERSION_COMPAT_STRING ""
|
||||
# define ROCPROFSYS_ROCM_VERSION_COMPAT_STRING ""
|
||||
#endif
|
||||
|
||||
// this should be passed to argparse::argument_parser::enable_version
|
||||
@@ -83,7 +83,7 @@
|
||||
{ \
|
||||
{ "", ROCPROFSYS_LIBRARY_ARCH }, { "compiler", ROCPROFSYS_COMPILER_STRING }, \
|
||||
{ \
|
||||
"rocm", ROCPROFSYS_HIP_VERSION_COMPAT_STRING \
|
||||
"rocm", ROCPROFSYS_ROCM_VERSION_COMPAT_STRING \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -109,148 +109,6 @@ get_environ(int _verbose, std::string _search_paths = {},
|
||||
_omnilib = common::path::find_path(_omnilib, _verbose, _search_paths);
|
||||
_omnilib_dl = common::path::find_path(_omnilib_dl, _verbose, _search_paths);
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
|
||||
_data.emplace_back(env_config{ "HSA_TOOLS_LIB", _omnilib.c_str(), 0 });
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
# if ROCPROFSYS_HIP_VERSION >= 50200
|
||||
# define ROCPROFILER_METRICS_DIR "lib/rocprofiler"
|
||||
# else
|
||||
# define ROCPROFILER_METRICS_DIR "rocprofiler/lib"
|
||||
# endif
|
||||
# if ROCPROFSYS_HIP_VERSION <= 50500
|
||||
# define ROCPROFILER_LIBNAME "librocprofiler64.so"
|
||||
# else
|
||||
# define ROCPROFILER_LIBNAME "librocprofiler64.so.1"
|
||||
# endif
|
||||
|
||||
_data.emplace_back(env_config{ "HSA_TOOLS_LIB", _omnilib.c_str(), 0 });
|
||||
_data.emplace_back(env_config{ "ROCP_TOOL_LIB", _omnilib.c_str(), 0 });
|
||||
_data.emplace_back(env_config{ "ROCPROFILER_LOG", "1", 0 });
|
||||
_data.emplace_back(env_config{ "ROCP_HSA_INTERCEPT", "1", 0 });
|
||||
_data.emplace_back(env_config{ "HSA_TOOLS_REPORT_LOAD_FAILURE", "1", 0 });
|
||||
|
||||
auto _possible_rocp_metrics = std::vector<std::string>{};
|
||||
auto _possible_rocprof_libs = std::vector<std::string>{};
|
||||
for(const auto* itr : { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" })
|
||||
{
|
||||
if(getenv(itr))
|
||||
{
|
||||
_possible_rocp_metrics.emplace_back(
|
||||
common::join('/', getenv(itr), "lib/rocprofiler"));
|
||||
_possible_rocprof_libs.emplace_back(
|
||||
common::join('/', getenv(itr), "lib/rocprofiler", ROCPROFILER_LIBNAME));
|
||||
_possible_rocp_metrics.emplace_back(
|
||||
common::join('/', getenv(itr), "rocprofiler/lib"));
|
||||
_possible_rocprof_libs.emplace_back(
|
||||
common::join('/', getenv(itr), "rocprofiler/lib", ROCPROFILER_LIBNAME));
|
||||
}
|
||||
}
|
||||
|
||||
// default path
|
||||
_possible_rocp_metrics.emplace_back(
|
||||
common::join('/', ROCPROFSYS_DEFAULT_ROCM_PATH, "lib/rocprofiler"));
|
||||
_possible_rocp_metrics.emplace_back(
|
||||
common::join('/', ROCPROFSYS_DEFAULT_ROCM_PATH, "rocprofiler/lib"));
|
||||
|
||||
auto _realpath_and_unique = [](const auto& _inp_v) {
|
||||
auto _out_v = decltype(_inp_v){};
|
||||
for(auto& itr : _inp_v)
|
||||
{
|
||||
if(path::exists(itr)) _out_v.emplace_back(path::realpath(itr));
|
||||
}
|
||||
|
||||
_out_v.erase(std::unique(_out_v.begin(), _out_v.end()), _out_v.end());
|
||||
return _out_v;
|
||||
};
|
||||
|
||||
_possible_rocprof_libs = _realpath_and_unique(_possible_rocprof_libs);
|
||||
|
||||
for(const auto& itr : _possible_rocprof_libs)
|
||||
{
|
||||
if(path::exists(itr))
|
||||
{
|
||||
_data.emplace_back(
|
||||
env_config{ "ROCPROFSYS_ROCPROFILER_LIBRARY", itr.c_str(), 0 });
|
||||
_possible_rocp_metrics.emplace(
|
||||
_possible_rocp_metrics.begin(),
|
||||
common::join('/', path::dirname(itr), "../../lib/rocprofiler"));
|
||||
_possible_rocp_metrics.emplace(_possible_rocp_metrics.begin(),
|
||||
common::join('/', path::dirname(itr)));
|
||||
}
|
||||
}
|
||||
|
||||
_possible_rocp_metrics = _realpath_and_unique(_possible_rocp_metrics);
|
||||
|
||||
auto _env_rocp_metrics = get_env("ROCP_METRICS", "");
|
||||
if(!_env_rocp_metrics.empty())
|
||||
{
|
||||
if(!path::exists(_env_rocp_metrics))
|
||||
throw std::runtime_error(join("", "Error! ROCP_METRICS file \"",
|
||||
_env_rocp_metrics, "\" does not exist"));
|
||||
_possible_rocp_metrics.clear();
|
||||
_possible_rocp_metrics.emplace_back(
|
||||
common::join('/', path::dirname(_env_rocp_metrics)));
|
||||
}
|
||||
|
||||
auto _found_rocp_metrics = (!_env_rocp_metrics.empty())
|
||||
? get_env("ROCPROFSYS_ROCP_METRICS_FORCE_VALID", false)
|
||||
: false;
|
||||
|
||||
if(!_found_rocp_metrics)
|
||||
{
|
||||
for(const auto& itr : _possible_rocp_metrics)
|
||||
{
|
||||
auto _metrics_path = join('/', itr, "metrics.xml");
|
||||
if(path::exists(itr) && path::exists(_metrics_path) &&
|
||||
path::exists(join('/', itr, "gfx_metrics.xml")))
|
||||
{
|
||||
_found_rocp_metrics = true;
|
||||
_data.emplace_back(
|
||||
env_config{ "ROCP_METRICS", _metrics_path.c_str(), 0 });
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handle error
|
||||
if(!_found_rocp_metrics)
|
||||
{
|
||||
auto _msg = std::stringstream{};
|
||||
_msg << std::boolalpha;
|
||||
if(!_env_rocp_metrics.empty())
|
||||
{
|
||||
auto _env_rocp_metrics_dir = path::dirname(_env_rocp_metrics);
|
||||
auto _rocp_metrics_xml = join('/', _env_rocp_metrics_dir, "metrics.xml");
|
||||
auto _rocp_gfx_metrics_xml =
|
||||
join('/', _env_rocp_metrics_dir, "gfx_metrics.xml");
|
||||
_msg << "Error! ROCP_METRICS=\"" << _env_rocp_metrics
|
||||
<< "\" in the environment but the directory (" << _env_rocp_metrics_dir
|
||||
<< ") does not contain "
|
||||
"metrics.xml (found: "
|
||||
<< path::exists(_rocp_metrics_xml) << ") and/or gfx_metrics.xml (found: "
|
||||
<< path::exists(_rocp_gfx_metrics_xml)
|
||||
<< "). To ignore this error, set "
|
||||
"ROCPROFSYS_ROCP_METRICS_FORCE_VALID=true in the environment";
|
||||
}
|
||||
else
|
||||
{
|
||||
_msg
|
||||
<< "Error! ROCP_METRICS not set in environment and rocprof-sys could not "
|
||||
"find a suitable path. Please set ROCP_METRICS=/path/to/metrics.xml "
|
||||
"in the environment. This file is typically located in the same "
|
||||
"folder as the librocprofiler64.so library.\nAdditional note: "
|
||||
"metrics.xml typically contains:\n\t#include "
|
||||
"\"gfx_metrics.xml\"\nMake sure the provided path also contains this "
|
||||
"file.\nExample:\n\texport ROCP_METRICS="
|
||||
<< ROCPROFSYS_DEFAULT_ROCM_PATH << "/" << ROCPROFILER_METRICS_DIR
|
||||
<< "/metrics.xml\n";
|
||||
}
|
||||
throw std::runtime_error(_msg.str());
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_OMPT) && ROCPROFSYS_USE_OMPT > 0
|
||||
if(get_env("ROCPROFSYS_USE_OMPT", true))
|
||||
{
|
||||
|
||||
@@ -0,0 +1,207 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <deque>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <stack>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
inline namespace common
|
||||
{
|
||||
using static_dtor_func_t = void (*)();
|
||||
|
||||
void
|
||||
destroy_static_objects();
|
||||
|
||||
void
|
||||
register_static_dtor(static_dtor_func_t&&);
|
||||
|
||||
namespace
|
||||
{
|
||||
struct anonymous
|
||||
{};
|
||||
} // namespace
|
||||
|
||||
struct do_not_destroy
|
||||
{};
|
||||
|
||||
/// Size, in bytes, of the static storage reserved for a singleton of type Tp.
template <typename Tp>
constexpr size_t
static_buffer_size()
{
    constexpr size_t _nbytes = sizeof(Tp);
    return _nbytes;
}
|
||||
|
||||
/**
|
||||
* @brief This struct is used to create static singleton objects which have the properties
|
||||
* of a heap-allocated static object without a memory leak.
|
||||
*
|
||||
* @tparam Tp Data type of singleton
|
||||
* @tparam ContextT Use to differentiate singletons in different translation units (if
|
||||
* using default parameter) or ensure the singleton can be accessed in different
|
||||
* translation units (not recommended) as long as this type is not in an anonymous
|
||||
* namespace
|
||||
*
|
||||
* This template works by creating a buffer of at least `sizeof(Tp)` bytes in the binary
|
||||
* and does a placement new into that buffer. The object created is NOT heap allocated,
|
||||
* the address of the object is an address in between the library load address and the
|
||||
* load address + size of library.
|
||||
*/
|
||||
template <typename Tp, typename ContextT = anonymous>
|
||||
struct static_object
|
||||
{
|
||||
static_object() = delete;
|
||||
~static_object() = delete;
|
||||
static_object(const static_object&) = delete;
|
||||
static_object(static_object&&) noexcept = delete;
|
||||
static_object& operator=(const static_object&) = delete;
|
||||
static_object& operator=(static_object&&) noexcept = delete;
|
||||
|
||||
template <typename... Args>
|
||||
static Tp*& construct(Args&&... args);
|
||||
|
||||
template <typename... Args>
|
||||
static Tp*& construct(do_not_destroy&&, Args&&... args);
|
||||
|
||||
static Tp* get() { return m_object; }
|
||||
|
||||
static constexpr bool is_trivial_standard_layout();
|
||||
|
||||
private:
|
||||
static Tp* m_object;
|
||||
static std::array<std::byte, static_buffer_size<Tp>()> m_buffer;
|
||||
};
|
||||
|
||||
template <typename Tp, typename ContextT>
|
||||
Tp* static_object<Tp, ContextT>::m_object = nullptr;
|
||||
|
||||
template <typename Tp, typename ContextT>
|
||||
std::array<std::byte, static_buffer_size<Tp>()>
|
||||
static_object<Tp, ContextT>::m_buffer = {};
|
||||
|
||||
template <typename Tp, typename ContextT>
|
||||
constexpr bool
|
||||
static_object<Tp, ContextT>::is_trivial_standard_layout()
|
||||
{
|
||||
return (std::is_standard_layout<Tp>::value && std::is_trivial<Tp>::value);
|
||||
}
|
||||
|
||||
template <typename Tp, typename ContextT>
|
||||
template <typename... Args>
|
||||
Tp*&
|
||||
static_object<Tp, ContextT>::construct(Args&&... args)
|
||||
{
|
||||
if constexpr(!is_trivial_standard_layout())
|
||||
{
|
||||
static auto _once = std::once_flag{};
|
||||
std::call_once(_once, []() {
|
||||
register_static_dtor([]() {
|
||||
if(static_object<Tp, ContextT>::m_object)
|
||||
{
|
||||
static_object<Tp, ContextT>::m_object->~Tp();
|
||||
static_object<Tp, ContextT>::m_object = nullptr;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
if(m_object)
|
||||
{
|
||||
std::cerr
|
||||
<< "reconstructing static object. Use get() function to retrieve pointer"
|
||||
<< std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
m_object = new(m_buffer.data()) Tp{ std::forward<Args>(args)... };
|
||||
return m_object;
|
||||
}
|
||||
|
||||
template <typename Tp, typename ContextT>
|
||||
template <typename... Args>
|
||||
Tp*&
|
||||
static_object<Tp, ContextT>::construct(do_not_destroy&&, Args&&... args)
|
||||
{
|
||||
if(m_object)
|
||||
{
|
||||
std::cerr
|
||||
<< "reconstructing static object. Use get() function to retrieve pointer"
|
||||
<< std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
m_object = new(m_buffer.data()) Tp{ std::forward<Args>(args)... };
|
||||
return m_object;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
inline auto*&
|
||||
get_static_object_stack()
|
||||
{
|
||||
static auto* _v = new std::stack<static_dtor_func_t>{};
|
||||
return _v;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
inline void
|
||||
destroy_static_objects()
|
||||
{
|
||||
static auto _sync = std::mutex{};
|
||||
auto _lk = std::unique_lock<std::mutex>{ _sync };
|
||||
|
||||
auto*& _stack = get_static_object_stack();
|
||||
if(_stack)
|
||||
{
|
||||
while(!_stack->empty())
|
||||
{
|
||||
auto& itr = _stack->top();
|
||||
if(itr) itr();
|
||||
_stack->pop();
|
||||
}
|
||||
|
||||
delete _stack;
|
||||
_stack = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
register_static_dtor(static_dtor_func_t&& _func)
|
||||
{
|
||||
static auto _sync = std::mutex{};
|
||||
auto _lk = std::unique_lock<std::mutex>{ _sync };
|
||||
|
||||
auto*& _stack = get_static_object_stack();
|
||||
if(_stack)
|
||||
{
|
||||
_stack->push(_func);
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace rocprofsys
|
||||
@@ -0,0 +1,167 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <shared_mutex>
|
||||
#include <type_traits>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
inline namespace common
|
||||
{
|
||||
/**
|
||||
* Synchronized is a wrapper that adds lock based write/read
|
||||
* protection around a datatype. The protected data is accessed
|
||||
* only by rlock/wlock. rlock(lambda) gets a reader lock of the
|
||||
* protected value, passing the protected value to the lambda as a
|
||||
* const. wlock(lambda) gets a writer lock on the protected value
|
||||
* and does the same. The reason for this class is to make it less
|
||||
* error prone to access shared data and more obvious when a lock
|
||||
* is being held.
|
||||
*
|
||||
* Example usage:
|
||||
*
|
||||
* synchronized<int> x(9);
|
||||
* x.rlock([](const auto& data){
|
||||
* // data = 9
|
||||
* });
|
||||
*
|
||||
* x.wlock([](auto& data){
|
||||
* // set data to new value
|
||||
* });
|
||||
*/
|
||||
template <typename LockedType, bool IsMappedTypeV = false>
class synchronized
{
public:
    using value_type = LockedType;
    using this_type  = synchronized<LockedType, IsMappedTypeV>;

    synchronized()  = default;
    ~synchronized() = default;

    // Takes ownership of the initial value by moving it into the wrapper.
    explicit synchronized(value_type&& data)
    : m_data{ std::move(data) }
    {}

    // Copy construction is disabled: the wrapper is intended to be moved, not copied.
    synchronized(const synchronized&) = delete;

    // NOTE(review): std::shared_mutex is documented as neither copyable nor movable, so
    // these defaulted move operations are presumably defined as deleted -- confirm.
    synchronized(synchronized&&) noexcept = default;
    synchronized& operator=(synchronized&&) noexcept = default;

    // Invokes the callable with a const reference to the data under a shared lock.
    template <typename FuncT, typename... Args>
    decltype(auto) rlock(FuncT&& lambda, Args&&... args) const;

    // Invokes the callable with a mutable reference to the data under a unique lock.
    template <typename FuncT, typename... Args>
    decltype(auto) wlock(FuncT&& lambda, Args&&... args);

    // Const-qualified wlock, only available when IsMappedTypeV is true. It lets a
    // synchronized map whose values are themselves synchronized hold a read lock on the
    // map while taking a write lock on the mapped value.
    template <typename FuncT, typename... Args, bool EnableForMappedType = IsMappedTypeV,
              std::enable_if_t<EnableForMappedType, int> = 0>
    decltype(auto) wlock(FuncT&& lambda, Args&&... args) const;

    // "Upgradable" lock: read runs under a shared lock and, if it returns false, write
    // runs under a unique lock. Equivalent to calling rlock() and then wlock().
    template <typename ReadFuncT, typename WriteFuncT, typename... Args>
    bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args);

private:
    // mutable so that the const member functions are able to lock it
    mutable std::shared_mutex m_mutex = {};
    value_type                m_data  = {};
};
|
||||
|
||||
//
|
||||
// member definitions
|
||||
//
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename FuncT, typename... Args>
|
||||
decltype(auto)
|
||||
synchronized<LockedType, IsMappedTypeV>::rlock(FuncT&& lambda, Args&&... args) const
|
||||
{
|
||||
static_assert(std::is_invocable<FuncT, const value_type&, Args...>::value,
|
||||
"function must accept const reference to locked type");
|
||||
|
||||
auto lock = std::shared_lock{ m_mutex };
|
||||
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename FuncT, typename... Args>
|
||||
decltype(auto)
|
||||
synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args)
|
||||
{
|
||||
static_assert(std::is_invocable<FuncT, value_type&, Args...>::value,
|
||||
"function must accept reference to locked type");
|
||||
|
||||
auto lock = std::unique_lock{ m_mutex };
|
||||
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
// This overload to wlock allows a synchronized map whose keys map to synchronized data to
|
||||
// use a read lock on the key data and then a write lock on the mapped data.
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename FuncT, typename... Args, bool EnableForMappedType,
|
||||
std::enable_if_t<EnableForMappedType, int>>
|
||||
decltype(auto)
|
||||
synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args) const
|
||||
{
|
||||
return const_cast<this_type*>(this)->wlock(std::forward<FuncT>(lambda),
|
||||
std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
// Upgradable lock. If read returns false, write will be called with a unique_lock.
|
||||
// Essentially a helper function that does .rlock() followed by .wlock().
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename ReadFuncT, typename WriteFuncT, typename... Args>
|
||||
bool
|
||||
synchronized<LockedType, IsMappedTypeV>::ulock(ReadFuncT&& read, WriteFuncT&& write,
|
||||
Args&&... args)
|
||||
{
|
||||
static_assert(std::is_invocable<ReadFuncT, const value_type&, Args...>::value,
|
||||
"read function must accept const reference to locked type");
|
||||
static_assert(std::is_invocable<WriteFuncT, value_type&, Args...>::value,
|
||||
"write function must accept reference to locked type");
|
||||
|
||||
using read_return_type = std::invoke_result_t<ReadFuncT, const value_type&, Args...>;
|
||||
using write_return_type = std::invoke_result_t<WriteFuncT, value_type&, Args...>;
|
||||
|
||||
static_assert(std::is_same<read_return_type, write_return_type>::value,
|
||||
"read and write functions must return same type");
|
||||
static_assert(std::is_same<read_return_type, bool>::value,
|
||||
"read/write functions must return bool");
|
||||
|
||||
{
|
||||
auto lock = std::shared_lock{ m_mutex };
|
||||
if(read(m_data, std::forward<Args>(args)...)) return true;
|
||||
}
|
||||
|
||||
auto lock = std::unique_lock{ m_mutex };
|
||||
return write(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace rocprofsys
|
||||
@@ -14,6 +14,7 @@ set(core_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/mproc.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perf.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perfetto.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/state.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/timemory.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/utility.cpp)
|
||||
@@ -29,13 +30,13 @@ set(core_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/dynamic_library.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/exception.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/gpu.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/hip_runtime.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/locking.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/mproc.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perf.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rccl.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/redirect.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/state.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/timemory.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/utility.hpp)
|
||||
@@ -54,6 +55,10 @@ add_subdirectory(containers)
|
||||
target_include_directories(rocprofiler-systems-core-library BEFORE
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
|
||||
target_include_directories(
|
||||
rocprofiler-systems-core-library
|
||||
PRIVATE ${PROJECT_SOURCE_DIR}/external/timemory/source/timemory/tpls/cereal)
|
||||
|
||||
target_link_libraries(rocprofiler-systems-core-library
|
||||
PRIVATE rocprofiler-systems::rocprofiler-systems-interface-library)
|
||||
target_link_libraries(
|
||||
@@ -67,8 +72,7 @@ target_link_libraries(
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-perfetto>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-timemory>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-mpi>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-hip>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm-smi>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libgcc-optional>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libstdcxx-optional>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-sanitizer>
|
||||
|
||||
@@ -222,17 +222,6 @@ init_parser(parser_data& _data)
|
||||
_data.dl_libpath = get_realpath(get_internal_libpath("librocprof-sys-dl.so").c_str());
|
||||
_data.omni_libpath = get_realpath(get_internal_libpath("librocprof-sys.so").c_str());
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) || defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
update_env(_data, "HSA_TOOLS_LIB", _data.dl_libpath);
|
||||
if(!getenv("HSA_TOOLS_REPORT_LOAD_FAILURE"))
|
||||
update_env(_data, "HSA_TOOLS_REPORT_LOAD_FAILURE", "1");
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
update_env(_data, "ROCP_TOOL_LIB", _data.omni_libpath);
|
||||
if(!getenv("ROCP_HSA_INTERCEPT")) update_env(_data, "ROCP_HSA_INTERCEPT", "1");
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_OMPT)
|
||||
if(!getenv("OMP_TOOL_LIBRARIES"))
|
||||
update_env(_data, "OMP_TOOL_LIBRARIES", _data.dl_libpath, UPD_PREPEND);
|
||||
@@ -300,15 +289,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
%{INDENT}% 0 avoid triggering the bug, potentially at the cost of reduced performance
|
||||
%{INDENT}% 1 do not modify how ROCm is notified about kernel completion)";
|
||||
|
||||
auto _realtime_reqs =
|
||||
(tim::get_env("HSA_ENABLE_INTERRUPT", std::string{}, false).empty())
|
||||
? strvec_t{ "hsa-interrupt" }
|
||||
: strvec_t{};
|
||||
|
||||
#if ROCPROFSYS_USE_ROCTRACER == 0 && ROCPROFSYS_USE_ROCPROFILER == 0
|
||||
_realtime_reqs.clear();
|
||||
#endif
|
||||
|
||||
const auto* _trace_policy_desc =
|
||||
R"(Policy for new data when the buffer size limit is reached:
|
||||
%{INDENT}%- discard : new data is ignored
|
||||
@@ -579,45 +559,29 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_backend_choices.erase("rcclp");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
_backend_choices.erase("amd-smi");
|
||||
_backend_choices.erase("rocm-smi");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
_backend_choices.erase("roctracer");
|
||||
_backend_choices.erase("roctx");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
_backend_choices.erase("rocprofiler");
|
||||
_backend_choices.erase("rocprofiler-sdk");
|
||||
_backend_choices.erase("rocm");
|
||||
#endif
|
||||
|
||||
if(gpu::device_count() == 0)
|
||||
{
|
||||
// remove GPU-specific backends
|
||||
_backend_choices.erase("rcclp");
|
||||
_backend_choices.erase("amd-smi");
|
||||
_backend_choices.erase("rocm-smi");
|
||||
_backend_choices.erase("roctracer");
|
||||
_backend_choices.erase("rocprofiler");
|
||||
_backend_choices.erase("rocprofiler-sdk");
|
||||
_backend_choices.erase("rocm");
|
||||
|
||||
#if defined(ROCPROFSYS_USE_RCCL)
|
||||
update_env(_data, "ROCPROFSYS_USE_RCCLP", false);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM_SMI)
|
||||
#if defined(ROCPROFSYS_USE_ROCM)
|
||||
update_env(_data, "ROCPROFSYS_USE_ROCM_SMI", false);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
update_env(_data, "ROCPROFSYS_USE_ROCTRACER", false);
|
||||
update_env(_data, "ROCPROFSYS_USE_ROCTX", false);
|
||||
update_env(_data, "ROCPROFSYS_ROCTRACER_HSA_ACTIVITY", false);
|
||||
update_env(_data, "ROCPROFSYS_ROCTRACER_HIP_ACTIVITY", false);
|
||||
_backend_choices.erase("roctracer");
|
||||
_backend_choices.erase("roctx");
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
update_env(_data, "ROCPROFSYS_USE_ROCPROFILER", false);
|
||||
update_env(_data, "ROCPROFSYS_USE_ROCM", false);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -640,11 +604,9 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
|
||||
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
|
||||
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
|
||||
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
|
||||
@@ -676,28 +638,13 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
|
||||
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
|
||||
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
|
||||
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
|
||||
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
|
||||
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
|
||||
|
||||
if(_v.count("all") > 0 ||
|
||||
(_v.count("roctracer") > 0 && _v.count("rocprofiler") > 0))
|
||||
{
|
||||
remove_env(_data, "HSA_TOOLS_LIB");
|
||||
remove_env(_data, "HSA_TOOLS_REPORT_LOAD_FAILURE");
|
||||
}
|
||||
|
||||
if(_v.count("all") > 0 || _v.count("rocprofiler") > 0)
|
||||
{
|
||||
remove_env(_data, "ROCP_TOOL_LIB");
|
||||
remove_env(_data, "ROCP_HSA_INTERCEPT");
|
||||
}
|
||||
|
||||
if(_v.count("all") > 0 || _v.count("ompt") > 0)
|
||||
remove_env(_data, "OMP_TOOL_LIBRARIES");
|
||||
|
||||
@@ -1126,7 +1073,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_parser.add_argument({ "--sample-realtime" }, _realtime_desc)
|
||||
.min_count(0)
|
||||
.dtype("[freq] [delay] [tids...]")
|
||||
.required(std::move(_realtime_reqs))
|
||||
.action([&](parser_t& p) {
|
||||
auto _v = p.get<std::deque<std::string>>("sample-realtime");
|
||||
update_env(_data, "ROCPROFSYS_SAMPLING_REALTIME", true);
|
||||
@@ -1210,25 +1156,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_data.processed_environs.emplace("papi_events");
|
||||
}
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
if(_data.environ_filter("gpu_events", _data))
|
||||
{
|
||||
_parser
|
||||
.add_argument({ "-G", "--gpu-events" },
|
||||
"Set the GPU hardware counter events to record (ref: "
|
||||
"`rocprof-sys-avail -H -c GPU`)")
|
||||
.min_count(1)
|
||||
.dtype("[EVENT ...]")
|
||||
.action([&](parser_t& p) {
|
||||
auto _events = join(array_config_t{ "," }, p.get<strvec_t>("gpu-events"));
|
||||
update_env(_data, "ROCPROFSYS_ROCM_EVENTS", _events);
|
||||
});
|
||||
|
||||
_data.processed_environs.emplace("gpu_events");
|
||||
_data.processed_environs.emplace("rocm_events");
|
||||
}
|
||||
#endif
|
||||
|
||||
add_group_arguments(_parser, "category", _data, true);
|
||||
add_group_arguments(_parser, "io", _data, true);
|
||||
add_group_arguments(_parser, "perfetto", _data, true);
|
||||
|
||||
@@ -91,19 +91,21 @@ ROCPROFSYS_DEFINE_CATEGORY(project, rocprofsys, ROCPROFSYS_CATEGORY_NONE, "rocpr
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, host, ROCPROFSYS_CATEGORY_HOST, "host", "Host-side function tracing")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, user, ROCPROFSYS_CATEGORY_USER, "user", "User-defined regions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, python, ROCPROFSYS_CATEGORY_PYTHON, "python", "Python regions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, device_hip, ROCPROFSYS_CATEGORY_DEVICE_HIP, "device_hip", "Device-side functions submitted via HIP API")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, device_hsa, ROCPROFSYS_CATEGORY_DEVICE_HSA, "device_hsa", "Device-side functions submitted via HSA API")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip, ROCPROFSYS_CATEGORY_ROCM_HIP, "rocm_hip", "Host-side HIP functions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hsa, ROCPROFSYS_CATEGORY_ROCM_HSA, "rocm_hsa", "Host-side HSA functions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_roctx, ROCPROFSYS_CATEGORY_ROCM_ROCTX, "rocm_roctx", "ROCTx labels")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm, ROCPROFSYS_CATEGORY_ROCM, "rocm", "General ROCm tracing")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip_api, ROCPROFSYS_CATEGORY_ROCM_HIP_API, "rocm_hip_api", "ROCm HIP functions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hsa_api, ROCPROFSYS_CATEGORY_ROCM_HSA_API, "rocm_hsa_api", "ROCm HSA functions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_kernel_dispatch, ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH, "rocm_kernel_dispatch", "ROCm Kernel dispatch")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_memory_copy, ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY, "rocm_memory_copy", "ROCm Async Memory Copy")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_scratch_memory, ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY, "rocm_scratch_memory", "ROCm kernel scratch memory reallocations")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_page_migration, ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION, "rocm_page_migration", "ROCm memory page migration")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION, "rocm_counter_collection", "ROCm device counter collection")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi, ROCPROFSYS_CATEGORY_ROCM_SMI, "rocm_smi", "rocm-smi data")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_busy, ROCPROFSYS_CATEGORY_ROCM_SMI_BUSY, "device_busy", "Busy percentage of a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, "device_temp", "Temperature of a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, roctracer, ROCPROFSYS_CATEGORY_ROCTRACER, "roctracer", "Kernel tracing provided by roctracer")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocprofiler, ROCPROFSYS_CATEGORY_ROCPROFILER, "rocprofiler", "HW counter data provided by rocprofiler")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, mpi, ROCPROFSYS_CATEGORY_MPI, "mpi", "MPI regions")
|
||||
@@ -151,19 +153,21 @@ using name = perfetto_category<Tp...>;
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::user), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::python), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::sampling), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::device_hip), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::device_hsa), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hip), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hsa), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_roctx), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hip_api), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hsa_api), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_kernel_dispatch), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_memory_copy), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_scratch_memory), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_page_migration), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_counter_collection), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_marker_api), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_busy), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_temp), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::roctracer), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocprofiler), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::mpi), \
|
||||
|
||||
@@ -96,14 +96,6 @@ struct functors;
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type)
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type)
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_RCCL)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, category::rocm_rccl, false_type)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type)
|
||||
@@ -124,7 +116,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_cpu_clock, fa
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_percent, false_type)
|
||||
#endif
|
||||
|
||||
#if !defined(TIMEMORY_USE_LIBUNWIND) || !defined(ROCPROFSYS_USE_ROCM_SMI)
|
||||
#if !defined(TIMEMORY_USE_LIBUNWIND) || !defined(ROCPROFSYS_USE_ROCM)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, false_type)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type)
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#include "config.hpp"
|
||||
#include "common/defines.h"
|
||||
#include "common/static_object.hpp"
|
||||
#include "constraint.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "defines.hpp"
|
||||
@@ -29,9 +30,9 @@
|
||||
#include "mproc.hpp"
|
||||
#include "perf.hpp"
|
||||
#include "perfetto.hpp"
|
||||
#include "rocprofiler-sdk.hpp"
|
||||
#include "utility.hpp"
|
||||
|
||||
#include <asm-generic/errno-base.h>
|
||||
#include <timemory/backends/capability.hpp>
|
||||
#include <timemory/backends/dmp.hpp>
|
||||
#include <timemory/backends/mpi.hpp>
|
||||
@@ -52,6 +53,7 @@
|
||||
#include <timemory/utility/filepath.hpp>
|
||||
#include <timemory/utility/join.hpp>
|
||||
#include <timemory/utility/signals.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@@ -60,6 +62,7 @@
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <exception>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <linux/capability.h>
|
||||
@@ -67,6 +70,7 @@
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <unistd.h>
|
||||
#include <utility>
|
||||
|
||||
@@ -76,6 +80,11 @@ using settings = tim::settings;
|
||||
|
||||
namespace
|
||||
{
|
||||
int verbose_value = tim::get_env<int>("ROCPROFSYS_VERBOSE", 0, false);
|
||||
bool debug_value = tim::get_env<bool>("ROCPROFSYS_DEBUG", false, false);
|
||||
bool is_ci_value = tim::get_env<bool>("ROCPROFSYS_CI", false, false);
|
||||
auto configure_once = std::once_flag{};
|
||||
|
||||
TIMEMORY_NOINLINE bool&
|
||||
_settings_are_configured()
|
||||
{
|
||||
@@ -83,6 +92,14 @@ _settings_are_configured()
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto*&
|
||||
get_config_impl()
|
||||
{
|
||||
static auto*& _v = common::static_object<std::shared_ptr<settings>>::construct(
|
||||
common::do_not_destroy{}, settings::shared_instance());
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto
|
||||
get_config()
|
||||
{
|
||||
@@ -97,7 +114,7 @@ get_config()
|
||||
std::string
|
||||
get_setting_name(std::string _v)
|
||||
{
|
||||
static const auto _prefix = tim::string_view_t{ "rocprofsys_" };
|
||||
constexpr auto _prefix = tim::string_view_t{ "rocprofsys_" };
|
||||
for(auto& itr : _v)
|
||||
itr = tolower(itr);
|
||||
auto _pos = _v.find(_prefix);
|
||||
@@ -195,7 +212,7 @@ configure_settings(bool _init)
|
||||
|
||||
if(settings_are_configured()) return;
|
||||
|
||||
if(get_is_continuous_integration() && get_state() < State::Init)
|
||||
if(is_ci_value && get_state() < State::Init)
|
||||
{
|
||||
timemory_print_demangled_backtrace<64>();
|
||||
ROCPROFSYS_THROW("config::configure_settings() called before "
|
||||
@@ -220,17 +237,17 @@ configure_settings(bool _init)
|
||||
tim::manager::add_metadata("ROCPROFSYS_COMPILER_VERSION",
|
||||
ROCPROFSYS_COMPILER_VERSION);
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION > 0
|
||||
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION", ROCPROFSYS_HIP_VERSION_STRING);
|
||||
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_MAJOR",
|
||||
ROCPROFSYS_HIP_VERSION_MAJOR);
|
||||
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_MINOR",
|
||||
ROCPROFSYS_HIP_VERSION_MINOR);
|
||||
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_PATCH",
|
||||
ROCPROFSYS_HIP_VERSION_PATCH);
|
||||
#if ROCPROFSYS_ROCM_VERSION > 0
|
||||
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION", ROCPROFSYS_ROCM_VERSION_STRING);
|
||||
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_MAJOR",
|
||||
ROCPROFSYS_ROCM_VERSION_MAJOR);
|
||||
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_MINOR",
|
||||
ROCPROFSYS_ROCM_VERSION_MINOR);
|
||||
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_PATCH",
|
||||
ROCPROFSYS_ROCM_VERSION_PATCH);
|
||||
#endif
|
||||
|
||||
auto _config = settings::shared_instance();
|
||||
auto _config = *get_config_impl();
|
||||
|
||||
// if using timemory, default to perfetto being off
|
||||
auto _default_perfetto_v = !tim::get_env<bool>("ROCPROFSYS_PROFILE", false, false);
|
||||
@@ -294,24 +311,15 @@ configure_settings(bool _init)
|
||||
"Enable causal profiling analysis", false, "backend",
|
||||
"causal", "analysis");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCTRACER",
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCM",
|
||||
"Enable ROCm API and kernel tracing", true, "backend",
|
||||
"roctracer", "rocm");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCPROFILER",
|
||||
"Enable ROCm hardware counters", true, "backend",
|
||||
"rocprofiler", "rocm");
|
||||
"rocm");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
bool, "ROCPROFSYS_USE_ROCM_SMI",
|
||||
"Enable sampling GPU power, temp, utilization, and memory usage", true, "backend",
|
||||
"rocm_smi", "rocm", "process_sampling");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
bool, "ROCPROFSYS_USE_ROCTX",
|
||||
"Enable ROCtx API. Warning! Out-of-order ranges may corrupt perfetto flamegraph",
|
||||
false, "backend", "roctracer", "rocm", "roctx");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_SAMPLING",
|
||||
"Enable statistical sampling of call-stack", false,
|
||||
"backend", "sampling");
|
||||
@@ -616,41 +624,7 @@ configure_settings(bool _init)
|
||||
"sampling", "hardware_counters")
|
||||
->set_choices(perf::get_config_choices());
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HIP_API",
|
||||
"Enable HIP API tracing support", true, "roctracer", "rocm",
|
||||
"advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
bool, "ROCPROFSYS_ROCTRACER_HIP_API_BACKTRACE",
|
||||
"Enable annotating the perfetto debug annotation with backtraces", false,
|
||||
"roctracer", "rocm", "perfetto", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HIP_ACTIVITY",
|
||||
"Enable HIP activity tracing support", true, "roctracer",
|
||||
"rocm", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HSA_ACTIVITY",
|
||||
"Enable HSA activity tracing support", false, "roctracer",
|
||||
"rocm", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HSA_API",
|
||||
"Enable HSA API tracing support", false, "roctracer",
|
||||
"rocm", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCTRACER_HSA_API_TYPES",
|
||||
"HSA API type to collect", "", "roctracer", "rocm",
|
||||
"advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_DISCARD_BARRIERS",
|
||||
"Skip barrier marker events in traces", false, "roctracer",
|
||||
"rocm", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
std::string, "ROCPROFSYS_ROCM_EVENTS",
|
||||
"ROCm hardware counters. Use ':device=N' syntax to specify collection on device "
|
||||
"number N, e.g. ':device=0'. If no device specification is provided, the event "
|
||||
"is collected on every available device",
|
||||
"", "rocprofiler", "rocm", "hardware_counters");
|
||||
rocprofiler_sdk::config_settings(_config);
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS",
|
||||
"rocm-smi metrics to collect: busy, temp, power, mem_usage",
|
||||
@@ -670,12 +644,6 @@ configure_settings(bool _init)
|
||||
"default to the value of ROCPROFSYS_COLLAPSE_PROCESSES",
|
||||
false, "perfetto", "data", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
bool, "ROCPROFSYS_PERFETTO_ROCTRACER_PER_STREAM",
|
||||
"Separate roctracer GPU side traces (copies, kernels) into separate "
|
||||
"tracks based on the stream they're enqueued into",
|
||||
true, "perfetto", "roctracer", "rocm", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
std::string, "ROCPROFSYS_PERFETTO_FILL_POLICY",
|
||||
"Behavior when perfetto buffer is full. 'discard' will ignore new entries, "
|
||||
@@ -704,18 +672,6 @@ configure_settings(bool _init)
|
||||
"feature may dramatically reduce the size of the trace",
|
||||
true, "perfetto", "data", "debugging", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
bool, "ROCPROFSYS_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS",
|
||||
"When PERFETTO_ANNOTATIONS, USE_ROCTRACER, and ROCTRACER_HIP_API are all "
|
||||
"enabled, enabling this option will result in the arg information for HIP API "
|
||||
"calls to all be within one annotation (e.g., args=\"stream=0x0, dst=0x1F, "
|
||||
"sizeBytes=64, src=0x08, kind=1\"). When disabled, each parameter will be an "
|
||||
"individual annotation (e.g. stream, dst, sizeBytes, etc.). The benefit of the "
|
||||
"former is that it is faster to serialize and consumes less file space; the "
|
||||
"benefit of the latter is that it becomes much easier to find slices in the "
|
||||
"trace with the same value",
|
||||
false, "perfetto", "data", "debugging", "roctracer", "rocm", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
uint64_t, "ROCPROFSYS_THREAD_POOL_SIZE",
|
||||
"Max number of threads for processing background tasks",
|
||||
@@ -1045,6 +1001,10 @@ configure_settings(bool _init)
|
||||
|
||||
settings::suppress_config() = true;
|
||||
|
||||
if(auto opt = get_setting_value<int>("ROCPROFSYS_VERBOSE"); opt) verbose_value = *opt;
|
||||
if(auto opt = get_setting_value<bool>("ROCPROFSYS_DEBUG"); opt) debug_value = *opt;
|
||||
if(auto opt = get_setting_value<bool>("ROCPROFSYS_CI"); opt) is_ci_value = *opt;
|
||||
|
||||
if(get_env("ROCPROFSYS_MONOCHROME", _config->get<bool>("ROCPROFSYS_MONOCHROME")))
|
||||
tim::log::monochrome() = true;
|
||||
|
||||
@@ -1106,6 +1066,10 @@ configure_settings(bool _init)
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE(2, "configuration complete\n");
|
||||
|
||||
if(auto opt = get_setting_value<int>("ROCPROFSYS_VERBOSE"); opt) verbose_value = *opt;
|
||||
if(auto opt = get_setting_value<bool>("ROCPROFSYS_DEBUG"); opt) debug_value = *opt;
|
||||
if(auto opt = get_setting_value<bool>("ROCPROFSYS_CI"); opt) is_ci_value = *opt;
|
||||
|
||||
_settings_are_configured() = true;
|
||||
}
|
||||
|
||||
@@ -1140,8 +1104,6 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
|
||||
_set("ROCPROFSYS_PROFILE", false);
|
||||
_set("ROCPROFSYS_USE_CAUSAL", false);
|
||||
_set("ROCPROFSYS_USE_ROCM_SMI", false);
|
||||
_set("ROCPROFSYS_USE_ROCTRACER", false);
|
||||
_set("ROCPROFSYS_USE_ROCPROFILER", false);
|
||||
_set("ROCPROFSYS_USE_KOKKOSP", false);
|
||||
_set("ROCPROFSYS_USE_RCCLP", false);
|
||||
_set("ROCPROFSYS_USE_OMPT", false);
|
||||
@@ -1164,12 +1126,11 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
|
||||
|
||||
if(gpu::device_count() == 0)
|
||||
{
|
||||
#if ROCPROFSYS_HIP_VERSION > 0
|
||||
ROCPROFSYS_BASIC_VERBOSE(1, "No HIP devices were found: disabling roctracer, "
|
||||
"rocprofiler, and rocm_smi...\n");
|
||||
#if ROCPROFSYS_ROCM_VERSION > 0
|
||||
ROCPROFSYS_BASIC_VERBOSE(
|
||||
1, "No ROCm devices were found: disabling rocm and rocm_smi...\n");
|
||||
#endif
|
||||
_set("ROCPROFSYS_USE_ROCPROFILER", false);
|
||||
_set("ROCPROFSYS_USE_ROCTRACER", false);
|
||||
_set("ROCPROFSYS_USE_ROCM", false);
|
||||
_set("ROCPROFSYS_USE_ROCM_SMI", false);
|
||||
}
|
||||
|
||||
@@ -1202,9 +1163,8 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
|
||||
_set("ROCPROFSYS_USE_TRACE", false);
|
||||
_set("ROCPROFSYS_PROFILE", false);
|
||||
_set("ROCPROFSYS_USE_CAUSAL", false);
|
||||
_set("ROCPROFSYS_USE_ROCM", false);
|
||||
_set("ROCPROFSYS_USE_ROCM_SMI", false);
|
||||
_set("ROCPROFSYS_USE_ROCTRACER", false);
|
||||
_set("ROCPROFSYS_USE_ROCPROFILER", false);
|
||||
_set("ROCPROFSYS_USE_KOKKOSP", false);
|
||||
_set("ROCPROFSYS_USE_RCCLP", false);
|
||||
_set("ROCPROFSYS_USE_OMPT", false);
|
||||
@@ -1389,22 +1349,9 @@ configure_disabled_settings(const std::shared_ptr<settings>& _config)
|
||||
_handle_use_option("ROCPROFSYS_USE_OMPT", "ompt");
|
||||
_handle_use_option("ROCPROFSYS_USE_RCCLP", "rcclp");
|
||||
_handle_use_option("ROCPROFSYS_USE_ROCM_SMI", "rocm_smi");
|
||||
_handle_use_option("ROCPROFSYS_USE_ROCTRACER", "roctracer");
|
||||
_handle_use_option("ROCPROFSYS_USE_ROCPROFILER", "rocprofiler");
|
||||
_handle_use_option("ROCPROFSYS_USE_ROCM", "rocm");
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER) || ROCPROFSYS_USE_ROCTRACER == 0
|
||||
_config->find("ROCPROFSYS_USE_ROCTRACER")->second->set_hidden(true);
|
||||
for(const auto& itr : _config->disable_category("roctracer"))
|
||||
_config->find(itr)->second->set_hidden(true);
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER) || ROCPROFSYS_USE_ROCPROFILER == 0
|
||||
_config->find("ROCPROFSYS_USE_ROCPROFILER")->second->set_hidden(true);
|
||||
for(const auto& itr : _config->disable_category("rocprofiler"))
|
||||
_config->find(itr)->second->set_hidden(true);
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM_SMI) || ROCPROFSYS_USE_ROCM_SMI == 0
|
||||
#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0
|
||||
_config->find("ROCPROFSYS_USE_ROCM_SMI")->second->set_hidden(true);
|
||||
for(const auto& itr : _config->disable_category("rocm_smi"))
|
||||
_config->find(itr)->second->set_hidden(true);
|
||||
@@ -1567,7 +1514,7 @@ print_banner(std::ostream& _os)
|
||||
{ "tag", ROCPROFSYS_GIT_DESCRIBE },
|
||||
{ "", ROCPROFSYS_LIBRARY_ARCH },
|
||||
{ "compiler", ROCPROFSYS_COMPILER_STRING },
|
||||
{ "rocm", ROCPROFSYS_HIP_VERSION_COMPAT_STRING } });
|
||||
{ "rocm", ROCPROFSYS_ROCM_VERSION_COMPAT_STRING } });
|
||||
|
||||
// <NAME> <VERSION> (<PROPERTIES>)
|
||||
if(!_properties.empty())
|
||||
@@ -1797,10 +1744,7 @@ get_debug_env()
|
||||
bool
|
||||
get_is_continuous_integration()
|
||||
{
|
||||
if(!settings_are_configured())
|
||||
return tim::get_env<bool>("ROCPROFSYS_CI", false, false);
|
||||
static auto _v = get_config()->find("ROCPROFSYS_CI");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
return is_ci_value;
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -1818,8 +1762,8 @@ get_debug_finalize()
|
||||
bool
|
||||
get_debug()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_DEBUG");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
std::call_once(configure_once, []() { (void) get_config(); });
|
||||
return debug_value;
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -1842,15 +1786,15 @@ get_verbose_env()
|
||||
int
|
||||
get_verbose()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_VERBOSE");
|
||||
return static_cast<tim::tsettings<int>&>(*_v->second).get();
|
||||
std::call_once(configure_once, []() { (void) get_config(); });
|
||||
return verbose_value;
|
||||
}
|
||||
|
||||
bool&
|
||||
get_use_perfetto()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_TRACE");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
static auto _v = get_config()->at("ROCPROFSYS_TRACE");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v).get();
|
||||
}
|
||||
|
||||
bool&
|
||||
@@ -1867,43 +1811,10 @@ get_use_causal()
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_use_roctracer()
|
||||
{
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
|
||||
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCTRACER");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
get_perfetto_roctracer_per_stream()
|
||||
{
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
|
||||
static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_ROCTRACER_PER_STREAM");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
get_use_rocprofiler()
|
||||
{
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCPROFILER");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
get_use_rocm_smi()
|
||||
{
|
||||
#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCM_SMI");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
#else
|
||||
@@ -1911,17 +1822,6 @@ get_use_rocm_smi()
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
get_use_roctx()
|
||||
{
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
|
||||
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCTX");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool&
|
||||
get_use_sampling()
|
||||
{
|
||||
@@ -2031,34 +1931,6 @@ get_sampling_cputime_signal()
|
||||
return static_cast<tim::tsettings<int>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_trace_hip_api()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HIP_API");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_trace_hip_activity()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HIP_ACTIVITY");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_trace_hsa_api()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HSA_API");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_trace_hsa_activity()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HSA_ACTIVITY");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
size_t
|
||||
get_perfetto_shmem_size_hint()
|
||||
{
|
||||
@@ -2176,14 +2048,6 @@ get_thread_pool_size()
|
||||
return _v;
|
||||
}
|
||||
|
||||
std::string
|
||||
get_trace_hsa_api_types()
|
||||
{
|
||||
static std::string _v =
|
||||
get_config()->get<std::string>("ROCPROFSYS_ROCTRACER_HSA_API_TYPES");
|
||||
return _v;
|
||||
}
|
||||
|
||||
std::string&
|
||||
get_perfetto_backend()
|
||||
{
|
||||
@@ -2360,7 +2224,7 @@ get_process_sampling_duration()
|
||||
std::string
|
||||
get_sampling_gpus()
|
||||
{
|
||||
#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
static auto _v = get_config()->find("ROCPROFSYS_SAMPLING_GPUS");
|
||||
return static_cast<tim::tsettings<std::string>&>(*_v->second).get();
|
||||
#else
|
||||
@@ -2375,13 +2239,6 @@ get_trace_thread_locks()
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
std::string
|
||||
get_rocm_events()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_ROCM_EVENTS");
|
||||
return static_cast<tim::tsettings<std::string>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_trace_thread_rwlocks()
|
||||
{
|
||||
|
||||
@@ -101,17 +101,22 @@ get_exe_realpath();
|
||||
|
||||
template <typename Tp>
|
||||
bool
|
||||
set_setting_value(const std::string& _name, Tp&& _v)
|
||||
set_setting_value(const std::string& _name, Tp&& _v,
|
||||
settings::update_type _upd = settings::update_type::user)
|
||||
{
|
||||
auto _user_upd = tim::settings::update_type::user;
|
||||
auto _instance = tim::settings::shared_instance();
|
||||
auto _setting = _instance->find(_name);
|
||||
auto* _instance = tim::settings::instance();
|
||||
if(!_instance) return false;
|
||||
|
||||
auto _setting = _instance->find(_name);
|
||||
if(_setting == _instance->end()) return false;
|
||||
if(!_setting->second) return false;
|
||||
|
||||
auto& itr = _setting->second;
|
||||
auto _upd = itr->set_user_updated();
|
||||
auto _success = itr->set(std::forward<Tp>(_v), _user_upd);
|
||||
if(!_success) itr->set_updated(_upd);
|
||||
auto _old_upd = itr->get_updated_type();
|
||||
|
||||
auto _success = itr->set(std::forward<Tp>(_v), _upd);
|
||||
if(!_success) itr->set_updated(_old_upd);
|
||||
|
||||
return _success;
|
||||
}
|
||||
|
||||
@@ -119,10 +124,13 @@ template <typename Tp>
|
||||
bool
|
||||
set_default_setting_value(const std::string& _name, Tp&& _v)
|
||||
{
|
||||
auto _instance = tim::settings::shared_instance();
|
||||
auto _setting = _instance->find(_name);
|
||||
auto* _instance = tim::settings::instance();
|
||||
if(!_instance) return false;
|
||||
|
||||
auto _setting = _instance->find(_name);
|
||||
if(_setting == _instance->end()) return false;
|
||||
if(!_setting->second) return false;
|
||||
|
||||
if(_setting->second->get_config_updated() || _setting->second->get_environ_updated())
|
||||
return false;
|
||||
return _setting->second->set(std::forward<Tp>(_v));
|
||||
@@ -132,10 +140,12 @@ template <typename Tp>
|
||||
std::optional<Tp>
|
||||
get_setting_value(const std::string& _name)
|
||||
{
|
||||
auto _instance = tim::settings::shared_instance();
|
||||
if(!_instance) return std::optional<Tp>{};
|
||||
auto* _instance = tim::settings::instance();
|
||||
if(!_instance) return std::nullopt;
|
||||
|
||||
auto _setting = _instance->find(_name);
|
||||
if(_setting == _instance->end() || !_setting->second) return std::optional<Tp>{};
|
||||
|
||||
auto&& _ret = _setting->second->get<Tp>();
|
||||
return (_ret.first) ? std::optional<Tp>{ _ret.second } : std::optional<Tp>{};
|
||||
}
|
||||
@@ -194,18 +204,9 @@ get_use_timemory() ROCPROFSYS_HOT;
|
||||
bool&
|
||||
get_use_causal() ROCPROFSYS_HOT;
|
||||
|
||||
bool
|
||||
get_use_roctracer() ROCPROFSYS_HOT;
|
||||
|
||||
bool
|
||||
get_use_rocprofiler() ROCPROFSYS_HOT;
|
||||
|
||||
bool
|
||||
get_use_rocm_smi() ROCPROFSYS_HOT;
|
||||
|
||||
bool
|
||||
get_use_roctx();
|
||||
|
||||
bool&
|
||||
get_use_sampling() ROCPROFSYS_HOT;
|
||||
|
||||
@@ -236,18 +237,6 @@ get_sampling_keep_internal();
|
||||
bool
|
||||
get_use_rcclp();
|
||||
|
||||
bool
|
||||
get_trace_hip_api();
|
||||
|
||||
bool
|
||||
get_trace_hip_activity();
|
||||
|
||||
bool
|
||||
get_trace_hsa_api();
|
||||
|
||||
bool
|
||||
get_trace_hsa_activity();
|
||||
|
||||
size_t
|
||||
get_perfetto_shmem_size_hint();
|
||||
|
||||
@@ -272,9 +261,6 @@ get_perfetto_annotations() ROCPROFSYS_HOT;
|
||||
uint64_t
|
||||
get_thread_pool_size();
|
||||
|
||||
std::string
|
||||
get_trace_hsa_api_types();
|
||||
|
||||
std::string&
|
||||
get_perfetto_backend();
|
||||
|
||||
@@ -282,9 +268,6 @@ get_perfetto_backend();
|
||||
std::string
|
||||
get_perfetto_output_filename();
|
||||
|
||||
bool
|
||||
get_perfetto_roctracer_per_stream() ROCPROFSYS_HOT;
|
||||
|
||||
double
|
||||
get_trace_delay();
|
||||
|
||||
@@ -360,9 +343,6 @@ get_trace_thread_barriers();
|
||||
bool
|
||||
get_trace_thread_join();
|
||||
|
||||
std::string
|
||||
get_rocm_events();
|
||||
|
||||
bool
|
||||
get_use_tmp_files();
|
||||
|
||||
|
||||
@@ -209,7 +209,7 @@ public:
|
||||
void push_back(Tp&& t);
|
||||
|
||||
template <typename... Args>
|
||||
void emplace_back(Args&&... args);
|
||||
decltype(auto) emplace_back(Args&&... args);
|
||||
|
||||
reference operator[](size_type i);
|
||||
|
||||
@@ -229,6 +229,14 @@ private:
|
||||
storage_type m_chunks;
|
||||
};
|
||||
|
||||
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
|
||||
template <typename... Args>
|
||||
decltype(auto)
|
||||
stable_vector<Tp, ChunkSizeV, AlignN>::emplace_back(Args&&... args)
|
||||
{
|
||||
return last_chunk().emplace_back(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
|
||||
stable_vector<Tp, ChunkSizeV, AlignN>::stable_vector(size_type count, const Tp& value)
|
||||
{
|
||||
@@ -332,14 +340,6 @@ stable_vector<Tp, ChunkSizeV, AlignN>::push_back(Tp&& t)
|
||||
last_chunk().push_back(std::move(t));
|
||||
}
|
||||
|
||||
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
|
||||
template <typename... Args>
|
||||
void
|
||||
stable_vector<Tp, ChunkSizeV, AlignN>::emplace_back(Args&&... args)
|
||||
{
|
||||
last_chunk().emplace_back(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
|
||||
typename stable_vector<Tp, ChunkSizeV, AlignN>::reference
|
||||
stable_vector<Tp, ChunkSizeV, AlignN>::operator[](size_type i)
|
||||
|
||||
@@ -20,22 +20,19 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_BEGIN \
|
||||
namespace tim \
|
||||
{ \
|
||||
namespace cereal \
|
||||
{
|
||||
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_END \
|
||||
} \
|
||||
} // namespace ::tim::cereal
|
||||
|
||||
#include "common/defines.h"
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
|
||||
# define ROCPROFSYS_USE_ROCM_SMI 0
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_HIP)
|
||||
# define ROCPROFSYS_USE_HIP 0
|
||||
#endif
|
||||
|
||||
#include "core/hip_runtime.hpp"
|
||||
|
||||
#if ROCPROFSYS_USE_HIP > 0
|
||||
# if !defined(TIMEMORY_USE_HIP)
|
||||
# define TIMEMORY_USE_HIP 1
|
||||
# endif
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
# define ROCPROFSYS_USE_ROCM 0
|
||||
#endif
|
||||
|
||||
#include "debug.hpp"
|
||||
@@ -44,24 +41,11 @@
|
||||
|
||||
#include <timemory/manager.hpp>
|
||||
|
||||
#if ROCPROFSYS_USE_ROCM_SMI > 0
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
# include <rocm_smi/rocm_smi.h>
|
||||
#endif
|
||||
|
||||
#if ROCPROFSYS_USE_HIP > 0
|
||||
# include <timemory/components/hip/backends.hpp>
|
||||
|
||||
# if !defined(ROCPROFSYS_HIP_RUNTIME_CALL)
|
||||
# define ROCPROFSYS_HIP_RUNTIME_CALL(err) \
|
||||
{ \
|
||||
if(err != ::tim::hip::success_v && (int) err != 0) \
|
||||
{ \
|
||||
ROCPROFSYS_THROW( \
|
||||
"[%s:%d] Warning! HIP API call failed with code %i :: %s\n", \
|
||||
__FILE__, __LINE__, (int) err, hipGetErrorString(err)); \
|
||||
} \
|
||||
}
|
||||
# endif
|
||||
# include <rocprofiler-sdk/agent.h>
|
||||
# include <rocprofiler-sdk/cxx/serialization.hpp>
|
||||
# include <rocprofiler-sdk/fwd.h>
|
||||
#endif
|
||||
|
||||
namespace rocprofsys
|
||||
@@ -70,9 +54,7 @@ namespace gpu
|
||||
{
|
||||
namespace
|
||||
{
|
||||
namespace scope = ::tim::scope;
|
||||
|
||||
#if ROCPROFSYS_USE_ROCM_SMI > 0
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
# define ROCPROFSYS_ROCM_SMI_CALL(ERROR_CODE) \
|
||||
::rocprofsys::gpu::check_rsmi_error(ERROR_CODE, __FILE__, __LINE__)
|
||||
|
||||
@@ -108,99 +90,47 @@ rsmi_init()
|
||||
|
||||
return _rsmi_init;
|
||||
}
|
||||
#endif
|
||||
#endif // ROCPROFSYS_USE_ROCM > 0
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION >= 60000
|
||||
template <typename ArchiveT, typename ArgT,
|
||||
std::enable_if_t<!std::is_pointer<ArgT>::value, int> = 0>
|
||||
void
|
||||
device_prop_serialize(ArchiveT& archive, const char* name, const ArgT& arg)
|
||||
int32_t
|
||||
query_rocm_gpu_agents()
|
||||
{
|
||||
namespace cereal = tim::cereal;
|
||||
using cereal::make_nvp;
|
||||
archive(make_nvp(name, arg));
|
||||
}
|
||||
|
||||
template <typename ArchiveT, typename ArgT, size_t N>
|
||||
void
|
||||
device_prop_serialize(ArchiveT& archive, const char* name, ArgT arg[N])
|
||||
{
|
||||
if constexpr(!std::is_same<ArgT, char>::value &&
|
||||
!std::is_same<ArgT, const char>::value)
|
||||
{
|
||||
namespace cereal = tim::cereal;
|
||||
using cereal::make_nvp;
|
||||
auto data = std::array<int, N>{};
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
data[i] = arg[i];
|
||||
archive(make_nvp(name, data));
|
||||
}
|
||||
else
|
||||
{
|
||||
device_prop_serialize(archive, name, std::string{ arg });
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
device_prop_serialize(ArchiveT& archive, const char* name, hipUUID_t arg)
|
||||
{
|
||||
constexpr auto N = sizeof(arg.bytes);
|
||||
namespace cereal = tim::cereal;
|
||||
using cereal::make_nvp;
|
||||
auto data = std::array<char, N + 1>{};
|
||||
data.fill('\0');
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
data[i] = arg.bytes[i];
|
||||
auto str_v = std::string_view{ data.data() };
|
||||
auto str = std::string{ str_v }.substr(0, str_v.find('\0'));
|
||||
archive(make_nvp(name, str));
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
device_prop_serialize(ArchiveT& archive, const char* name, hipDeviceArch_t arg)
|
||||
{
|
||||
namespace cereal = tim::cereal;
|
||||
using cereal::make_nvp;
|
||||
|
||||
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(NAME) \
|
||||
{ \
|
||||
auto val = arg.NAME; \
|
||||
archive(make_nvp(#NAME, val)); \
|
||||
int32_t _dev_cnt = 0;
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents,
|
||||
size_t num_agents, void* user_data) -> rocprofiler_status_t {
|
||||
auto* _cnt = static_cast<int32_t*>(user_data);
|
||||
for(size_t i = 0; i < num_agents; ++i)
|
||||
{
|
||||
const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);
|
||||
if(_agent && _agent->type == ROCPROFILER_AGENT_TYPE_GPU) *_cnt += 1;
|
||||
}
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
archive.setNextName(name);
|
||||
archive.startNode();
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt32Atomics)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalFloatAtomicExch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt32Atomics)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedFloatAtomicExch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFloatAtomicAdd)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt64Atomics)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt64Atomics)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDoubles)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpVote)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpBallot)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpShuffle)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFunnelShift)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasThreadFenceSystem)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSyncThreadsExt)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSurfaceFuncs)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(has3dGrid)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDynamicParallelism)
|
||||
archive.finishNode();
|
||||
|
||||
# undef ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH
|
||||
}
|
||||
try
|
||||
{
|
||||
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
|
||||
sizeof(rocprofiler_agent_v0_t), &_dev_cnt);
|
||||
} catch(std::exception& _e)
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE(
|
||||
1, "Exception thrown getting the rocm agents: %s. _dev_cnt=%d\n", _e.what(),
|
||||
_dev_cnt);
|
||||
}
|
||||
// rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
|
||||
// sizeof(rocprofiler_agent_v0_t), &_dev_cnt);
|
||||
#endif
|
||||
return _dev_cnt;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
int
|
||||
hip_device_count()
|
||||
rocm_device_count()
|
||||
{
|
||||
#if ROCPROFSYS_USE_HIP > 0
|
||||
return ::tim::hip::device_count();
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
static int _num_devices = query_rocm_gpu_agents();
|
||||
return _num_devices;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
@@ -209,7 +139,7 @@ hip_device_count()
|
||||
int
|
||||
rsmi_device_count()
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM_SMI > 0
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
if(!rsmi_init()) return 0;
|
||||
|
||||
static auto _num_devices = []() {
|
||||
@@ -234,11 +164,8 @@ rsmi_device_count()
|
||||
int
|
||||
device_count()
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM_SMI > 0
|
||||
// store as static since calls after rsmi_shutdown will return zero
|
||||
return rsmi_device_count();
|
||||
#elif ROCPROFSYS_USE_HIP > 0
|
||||
return ::tim::hip::device_count();
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
return rocm_device_count();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
@@ -246,251 +173,44 @@ device_count()
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
add_hip_device_metadata(ArchiveT& ar)
|
||||
add_device_metadata(ArchiveT& ar)
|
||||
{
|
||||
namespace cereal = tim::cereal;
|
||||
using cereal::make_nvp;
|
||||
|
||||
#if ROCPROFSYS_USE_HIP > 0
|
||||
int _device_count = 0;
|
||||
int _current_device = 0;
|
||||
hipError_t _device_count_err = hipGetDeviceCount(&_device_count);
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
using agent_vec_t = std::vector<rocprofiler_agent_v0_t>;
|
||||
|
||||
if(_device_count_err != hipSuccess) return;
|
||||
|
||||
hipError_t _current_device_err = hipGetDevice(&_current_device);
|
||||
|
||||
scope::destructor _dtor{ [_current_device, _current_device_err]() {
|
||||
if(_current_device_err == hipSuccess)
|
||||
auto _agents_vec = agent_vec_t{};
|
||||
auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents,
|
||||
size_t num_agents, void* user_data) -> rocprofiler_status_t {
|
||||
auto* _agents_vec_v = static_cast<agent_vec_t*>(user_data);
|
||||
_agents_vec_v->reserve(num_agents);
|
||||
for(size_t i = 0; i < num_agents; ++i)
|
||||
{
|
||||
ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(_current_device));
|
||||
const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);
|
||||
if(_agent) _agents_vec_v->emplace_back(*_agent);
|
||||
}
|
||||
} };
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
};
|
||||
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
|
||||
sizeof(rocprofiler_agent_v0_t), &_agents_vec);
|
||||
|
||||
if(_current_device_err != hipSuccess || _device_count == 0) return;
|
||||
|
||||
ar.setNextName("hip_device_properties");
|
||||
ar.startNode();
|
||||
ar.makeArray();
|
||||
|
||||
scope::destructor _prop_dtor{ [&ar]() { ar.finishNode(); } };
|
||||
for(int dev = 0; dev < _device_count; ++dev)
|
||||
{
|
||||
auto _device_prop = hipDeviceProp_t{};
|
||||
int _driver_version = 0;
|
||||
int _runtime_version = 0;
|
||||
ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(dev));
|
||||
ROCPROFSYS_HIP_RUNTIME_CALL(hipGetDeviceProperties(&_device_prop, dev));
|
||||
ROCPROFSYS_HIP_RUNTIME_CALL(hipDriverGetVersion(&_driver_version));
|
||||
ROCPROFSYS_HIP_RUNTIME_CALL(hipRuntimeGetVersion(&_runtime_version));
|
||||
|
||||
ar.startNode();
|
||||
|
||||
# if ROCPROFSYS_HIP_VERSION < 60000
|
||||
using intvec_t = std::vector<int>;
|
||||
|
||||
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \
|
||||
ar(make_nvp(#NAME, _device_prop.NAME));
|
||||
|
||||
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(NAME, ...) \
|
||||
ar(make_nvp(NAME, __VA_ARGS__));
|
||||
|
||||
ar(make_nvp("name", std::string{ _device_prop.name }));
|
||||
ar(make_nvp("driver_version", _driver_version));
|
||||
ar(make_nvp("runtime_version", _runtime_version));
|
||||
ar(make_nvp("capability.major_version", _device_prop.major));
|
||||
ar(make_nvp("capability.minor_version", _device_prop.minor));
|
||||
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate)
|
||||
|
||||
# if ROCPROFSYS_HIP_VERSION >= 50000
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize)
|
||||
# endif
|
||||
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(
|
||||
"maxThreadsDim",
|
||||
intvec_t{ _device_prop.maxThreadsDim[0], _device_prop.maxThreadsDim[1],
|
||||
_device_prop.maxThreadsDim[2] })
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(
|
||||
"maxGridSize",
|
||||
intvec_t{ _device_prop.maxGridSize[0], _device_prop.maxGridSize[1],
|
||||
_device_prop.maxGridSize[2] })
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision)
|
||||
# else
|
||||
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \
|
||||
device_prop_serialize(ar, #NAME, _device_prop.NAME);
|
||||
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(name)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(uuid)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luid)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luidDeviceNodeMask)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsDim)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxGridSize)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(major)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(minor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(texturePitchAlignment)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deviceOverlap)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1D)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DMipmap)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLinear)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2D)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DMipmap)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLinear)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DGather)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3D)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3DAlt)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemap)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLayered)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLayered)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemapLayered)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1D)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2D)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface3D)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1DLayered)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2DLayered)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemap)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemapLayered)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(surfaceAlignment)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(tccDriver)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asyncEngineCount)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedAddressing)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(persistingL2CacheMaxSize)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(streamPrioritiesSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(globalL1CacheSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(localL1CacheSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerMultiprocessor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerMultiprocessor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(managedMemory)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiGpuBoardGroupID)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostNativeAtomicSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(singleToDoublePrecisionPerfRatio)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computePreemptionSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canUseHostPointerForRegisteredMem)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlockOptin)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxBlocksPerMultiProcessor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(accessPolicyMaxWindowSize)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(reservedSharedMemPerBlock)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sparseHipArraySupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterReadOnlySupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(timelineSemaphoreInteropSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolsSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMASupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAFlushWritesOptions)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAWritesOrdering)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolSupportedHandleTypes)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deferredMappingHipArraySupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ipcEventSupported)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clusterLaunch)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedFunctionPointers)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(arch)
|
||||
// ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpMemFlushCntl)
|
||||
// ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpRegFlushCntl)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedFunc)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedGridDim)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedBlockDim)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedSharedMem)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isLargeBar)
|
||||
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision)
|
||||
# endif
|
||||
|
||||
const auto _compute_mode_descr = std::array<const char*, 6>{
|
||||
"Default (multiple host threads can use ::hipSetDevice() with device "
|
||||
"simultaneously)",
|
||||
"Exclusive (only one host thread in one process is able to use "
|
||||
"::hipSetDevice() with this device)",
|
||||
"Prohibited (no host thread can use ::hipSetDevice() with this device)",
|
||||
"Exclusive Process (many threads in one process is able to use "
|
||||
"::hipSetDevice() with this device)",
|
||||
"Unknown",
|
||||
nullptr
|
||||
};
|
||||
|
||||
auto _compute_mode = std::min<int>(_device_prop.computeMode, 5);
|
||||
ar(make_nvp("computeModeDescription",
|
||||
std::string{ _compute_mode_descr.at(_compute_mode) }));
|
||||
|
||||
ar.finishNode();
|
||||
}
|
||||
ar(make_nvp("rocm_agents", _agents_vec));
|
||||
#else
|
||||
(void) ar;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
add_hip_device_metadata()
|
||||
add_device_metadata()
|
||||
{
|
||||
if(device_count() == 0) return;
|
||||
|
||||
ROCPROFSYS_METADATA([](auto& ar) {
|
||||
try
|
||||
{
|
||||
add_hip_device_metadata(ar);
|
||||
add_device_metadata(ar);
|
||||
} catch(std::runtime_error& _e)
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(2, "%s\n", _e.what());
|
||||
|
||||
@@ -30,12 +30,12 @@ int
|
||||
device_count();
|
||||
|
||||
int
|
||||
hip_device_count();
|
||||
rocm_device_count();
|
||||
|
||||
int
|
||||
rsmi_device_count();
|
||||
|
||||
void
|
||||
add_hip_device_metadata();
|
||||
add_device_metadata();
|
||||
} // namespace gpu
|
||||
} // namespace rocprofsys
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include "core/defines.hpp"
|
||||
|
||||
#if defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
|
||||
# if defined(HIP_INCLUDE_HIP_HIP_RUNTIME_H) || \
|
||||
defined(HIP_INCLUDE_HIP_HIP_RUNTIME_API_H)
|
||||
@@ -35,22 +35,17 @@
|
||||
# define HIP_PROF_HIP_API_STRING 1
|
||||
|
||||
// following must be included before <roctracer_hip.h> for ROCm 6.0+
|
||||
# if ROCPROFSYS_HIP_VERSION >= 60000
|
||||
# if defined(USE_PROF_API)
|
||||
# undef USE_PROF_API
|
||||
# endif
|
||||
# include <hip/hip_runtime.h>
|
||||
# include <hip/hip_runtime_api.h>
|
||||
// must be included after hip_runtime_api.h
|
||||
# include <hip/hip_deprecated.h>
|
||||
// must be included after hip_runtime_api.h
|
||||
# include <hip_ostream_ops.h>
|
||||
// must be included after hip_runtime_api.h
|
||||
# include <hip/amd_detail/hip_prof_str.h>
|
||||
# else
|
||||
# include <hip/hip_runtime.h>
|
||||
# include <hip/hip_runtime_api.h>
|
||||
# if defined(USE_PROF_API)
|
||||
# undef USE_PROF_API
|
||||
# endif
|
||||
# include <hip/hip_runtime.h>
|
||||
# include <hip/hip_runtime_api.h>
|
||||
// must be included after hip_runtime_api.h
|
||||
# include <hip/hip_deprecated.h>
|
||||
// must be included after hip_runtime_api.h
|
||||
# include <roctracer/hip_ostream_ops.h>
|
||||
// must be included after hip_runtime_api.h
|
||||
# include <hip/amd_detail/hip_prof_str.h>
|
||||
|
||||
# include <hip/hip_version.h>
|
||||
#endif
|
||||
|
||||
@@ -104,6 +104,7 @@ perfetto_counter_track<Tp>::emplace(size_t _idx, const std::string& _v,
|
||||
for(const auto& itr : _name_data)
|
||||
{
|
||||
_missing.emplace_back(std::make_tuple(*itr, itr->c_str(), false));
|
||||
// TODO: _missing.emplace_back(*itr, itr->c_str(), false);
|
||||
}
|
||||
}
|
||||
auto _index = _track_data.size();
|
||||
|
||||
@@ -23,13 +23,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "core/defines.hpp"
|
||||
#include "core/hip_runtime.hpp"
|
||||
|
||||
#if defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0 && \
|
||||
defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
|
||||
# if ROCPROFSYS_HIP_VERSION == 0 || ROCPROFSYS_HIP_VERSION >= 50200
|
||||
# include <rccl/rccl.h>
|
||||
# else
|
||||
# include <rccl.h>
|
||||
# endif
|
||||
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
|
||||
# include <rccl/rccl.h>
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,576 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "core/rocprofiler-sdk.hpp"
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "timemory.hpp"
|
||||
#include <regex>
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
|
||||
# include <timemory/defines.h>
|
||||
# include <timemory/utility/demangle.hpp>
|
||||
|
||||
# include <rocprofiler-sdk/agent.h>
|
||||
# include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
# include <rocprofiler-sdk/fwd.h>
|
||||
|
||||
# include <algorithm>
|
||||
# include <cstdint>
|
||||
# include <set>
|
||||
# include <sstream>
|
||||
# include <string>
|
||||
# include <unordered_set>
|
||||
# include <vector>
|
||||
|
||||
// Evaluates a rocprofiler-sdk call exactly once and checks the returned
// rocprofiler_status_t. When the status is not ROCPROFILER_STATUS_SUCCESS, a
// message containing the stringified call, __FILE__/__LINE__, the numeric
// status and the text from rocprofiler_get_status_string() is assembled and
// handed to ROCPROFSYS_WARNING (first argument 0). The macro itself contains
// no return/throw, so the failing status is not propagated to the caller.
//
// NOTE(review): this expands to a bare `{ ... }` block rather than the usual
// `do { ... } while(0)` form, so `if(x) ROCPROFILER_CALL(y); else ...` will not
// parse as intended -- confirm call sites before changing the shape.
#    define ROCPROFILER_CALL(result)                                                     \
        {                                                                                \
            rocprofiler_status_t CHECKSTATUS = (result);                                 \
            if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS)                                \
            {                                                                            \
                auto        msg        = std::stringstream{};                            \
                std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS);     \
                msg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] "           \
                    << "rocprofiler-sdk call [" << #result                               \
                    << "] failed with error code " << CHECKSTATUS                        \
                    << " :: " << status_msg;                                             \
                ROCPROFSYS_WARNING(0, "%s\n", msg.str().c_str());                        \
            }                                                                            \
        }
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
namespace
|
||||
{
|
||||
std::string
|
||||
get_setting_name(std::string _v)
|
||||
{
|
||||
constexpr auto _prefix = tim::string_view_t{ "rocprofsys_" };
|
||||
for(auto& itr : _v)
|
||||
itr = tolower(itr);
|
||||
auto _pos = _v.find(_prefix);
|
||||
if(_pos == 0) return _v.substr(_prefix.length());
|
||||
return _v;
|
||||
}
|
||||
|
||||
// Registers one setting on `_config` (which must be a pointer-like object in
// scope at the expansion site) and yields the stored entry.
//
//   TYPE           value type of the setting
//   ENV_NAME       full environment-variable name; the short name is derived
//                  with get_setting_name(ENV_NAME)
//   DESCRIPTION    help text
//   INITIAL_VALUE  default, wrapped as TYPE{ INITIAL_VALUE }
//   ...            extra category strings appended to the fixed categories
//                  "custom", "rocprofsys" and "librocprof-sys"
//
// The expansion is an immediately-invoked lambda capturing by reference. If
// the insert reports `second == false` a "Duplicate setting" warning is
// printed; in either case the result is `_config->find(ENV_NAME)->second`.
#    define ROCPROFSYS_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...)   \
        [&]() {                                                                          \
            auto _ret = _config->insert<TYPE, TYPE>(                                     \
                ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION,                       \
                TYPE{ INITIAL_VALUE },                                                   \
                std::set<std::string>{ "custom", "rocprofsys", "librocprof-sys",         \
                                       __VA_ARGS__ });                                   \
            if(!_ret.second)                                                             \
            {                                                                            \
                ROCPROFSYS_PRINT("Warning! Duplicate setting: %s / %s\n",                \
                                 get_setting_name(ENV_NAME).c_str(), ENV_NAME);          \
            }                                                                            \
            return _config->find(ENV_NAME)->second;                                      \
        }()
|
||||
|
||||
/// Returns a lower-cased std::string copy of \p _val.
///
/// \tparam Tp   any type from which a std::string can be brace-constructed
///              (e.g. std::string, std::string_view)
/// \param _val  text to convert; not modified
/// \return      copy of \p _val with every character passed through tolower
template <typename Tp>
std::string
to_lower(const Tp& _val)
{
    auto _v = std::string{ _val };
    // tolower() is undefined for negative values other than EOF, so each
    // character is widened through unsigned char before the call
    for(auto& itr : _v)
        itr = static_cast<char>(::tolower(static_cast<unsigned char>(itr)));
    return _v;
}
|
||||
|
||||
/// Names of the three per-domain settings registered for a tracing domain:
/// the inclusive operation filter, the exclusive operation filter and the
/// list of operations that should record a backtrace. All default to empty.
struct operation_options
{
    std::string operations_include{};             // "<...>_OPERATIONS"
    std::string operations_exclude{};             // "<...>_OPERATIONS_EXCLUDE"
    std::string operations_annotate_backtrace{};  // "<...>_OPERATIONS_ANNOTATE_BACKTRACE"
};
|
||||
|
||||
auto callback_operation_option_names =
|
||||
std::unordered_map<rocprofiler_callback_tracing_kind_t, operation_options>{};
|
||||
auto buffered_operation_option_names =
|
||||
std::unordered_map<rocprofiler_buffer_tracing_kind_t, operation_options>{};
|
||||
|
||||
std::unordered_set<int32_t>
|
||||
get_operations_impl(rocprofiler_callback_tracing_kind_t kindv,
|
||||
const std::string& optname = {})
|
||||
{
|
||||
static const auto callback_tracing_info =
|
||||
rocprofiler::sdk::get_callback_tracing_names();
|
||||
|
||||
if(optname.empty())
|
||||
{
|
||||
auto _ret = std::unordered_set<int32_t>{};
|
||||
for(auto iitr : callback_tracing_info[kindv].items())
|
||||
{
|
||||
if(iitr.second && *iitr.second != "none") _ret.emplace(iitr.first);
|
||||
}
|
||||
return _ret;
|
||||
}
|
||||
|
||||
auto _val = get_setting_value<std::string>(optname);
|
||||
|
||||
ROCPROFSYS_CONDITIONAL_ABORT_F(!_val, "no setting %s\n", optname.c_str());
|
||||
|
||||
if(_val->empty()) return std::unordered_set<int32_t>{};
|
||||
|
||||
auto _ret = std::unordered_set<int32_t>{};
|
||||
for(const auto& itr : tim::delimit(*_val, " ,;:\n\t"))
|
||||
{
|
||||
for(auto iitr : callback_tracing_info[kindv].items())
|
||||
{
|
||||
auto _re = std::regex{ itr, std::regex_constants::icase };
|
||||
if(iitr.second && std::regex_search(iitr.second->data(), _re))
|
||||
{
|
||||
ROCPROFSYS_PRINT_F("%s ('%s') matched: %s\n", optname.c_str(),
|
||||
itr.c_str(), iitr.second->data());
|
||||
_ret.emplace(iitr.first);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return _ret;
|
||||
}
|
||||
|
||||
std::unordered_set<int32_t>
|
||||
get_operations_impl(rocprofiler_buffer_tracing_kind_t kindv,
|
||||
const std::string& optname = {})
|
||||
{
|
||||
static const auto buffered_tracing_info =
|
||||
rocprofiler::sdk::get_buffer_tracing_names();
|
||||
|
||||
if(optname.empty())
|
||||
{
|
||||
auto _ret = std::unordered_set<int32_t>{};
|
||||
for(auto iitr : buffered_tracing_info[kindv].items())
|
||||
{
|
||||
if(iitr.second && *iitr.second != "none") _ret.emplace(iitr.first);
|
||||
}
|
||||
return _ret;
|
||||
}
|
||||
|
||||
auto _val = get_setting_value<std::string>(optname);
|
||||
|
||||
ROCPROFSYS_CONDITIONAL_ABORT_F(!_val, "no setting %s\n", optname.c_str());
|
||||
|
||||
if(_val->empty()) return std::unordered_set<int32_t>{};
|
||||
|
||||
auto _ret = std::unordered_set<int32_t>{};
|
||||
for(const auto& itr : tim::delimit(*_val, " ,;:\n\t"))
|
||||
{
|
||||
for(auto iitr : buffered_tracing_info[kindv].items())
|
||||
{
|
||||
auto _re = std::regex{ itr, std::regex_constants::icase };
|
||||
if(iitr.second && std::regex_search(iitr.second->data(), _re))
|
||||
{
|
||||
ROCPROFSYS_PRINT_F("%s ('%s') matched: %s\n", optname.c_str(),
|
||||
itr.c_str(), iitr.second->data());
|
||||
_ret.emplace(iitr.first);
|
||||
}
|
||||
}
|
||||
}
|
||||
return _ret;
|
||||
}
|
||||
|
||||
/// Combines the complete, included and excluded operation id sets into the
/// final, ascending list of operation ids.
///
/// The starting set is \p _include when it is non-empty and \p _complete
/// otherwise; every id found in \p _exclude is then dropped.
///
/// \param _complete  all operation ids of the domain
/// \param _include   ids selected by the inclusive filter (may be empty)
/// \param _exclude   ids rejected by the exclusive filter (may be empty)
/// \return           sorted vector of the remaining ids
std::vector<int32_t>
get_operations_impl(const std::unordered_set<int32_t>& _complete,
                    const std::unordered_set<int32_t>& _include,
                    const std::unordered_set<int32_t>& _exclude)
{
    // an empty inclusive filter means "start from everything"
    const auto& _selected = _include.empty() ? _complete : _include;

    auto _ops = std::vector<int32_t>{};
    _ops.reserve(_selected.size());

    for(const auto _id : _selected)
    {
        if(_exclude.find(_id) == _exclude.end()) _ops.push_back(_id);
    }

    std::sort(_ops.begin(), _ops.end());
    return _ops;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
config_settings(const std::shared_ptr<settings>& _config)
|
||||
{
|
||||
// const auto agents = std::vector<rocprofiler_agent_t>{};
|
||||
const auto buffered_tracing_info = rocprofiler::sdk::get_buffer_tracing_names();
|
||||
const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();
|
||||
|
||||
auto _skip_domains =
|
||||
std::unordered_set<std::string_view>{ "none",
|
||||
"correlation_id_retirement",
|
||||
"marker_core_api",
|
||||
"marker_control_api",
|
||||
"marker_name_api",
|
||||
"code_object" };
|
||||
|
||||
auto _domain_choices = std::vector<std::string>{};
|
||||
auto _add_domain = [&_domain_choices, &_skip_domains](std::string_view _domain) {
|
||||
auto _v = to_lower(_domain);
|
||||
|
||||
if(_skip_domains.count(_v) == 0)
|
||||
{
|
||||
auto itr = std::find(_domain_choices.begin(), _domain_choices.end(), _v);
|
||||
if(itr == _domain_choices.end()) _domain_choices.emplace_back(_v);
|
||||
}
|
||||
};
|
||||
|
||||
static auto _option_names = std::unordered_set<std::string>{};
|
||||
auto _add_operation_settings = [&_config, &_skip_domains](
|
||||
std::string_view _domain_name, const auto& _domain,
|
||||
auto& _operation_option_names) {
|
||||
auto _v = to_lower(_domain_name);
|
||||
|
||||
if(_skip_domains.count(_v) > 0) return;
|
||||
|
||||
auto _op_option_name = JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS");
|
||||
auto _eop_option_name =
|
||||
JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS_EXCLUDE");
|
||||
auto _bt_option_name =
|
||||
JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS_ANNOTATE_BACKTRACE");
|
||||
|
||||
auto _op_choices = std::vector<std::string>{};
|
||||
for(auto itr : _domain.operations)
|
||||
_op_choices.emplace_back(std::string{ itr });
|
||||
|
||||
if(_op_choices.empty()) return;
|
||||
|
||||
_operation_option_names.emplace(
|
||||
_domain.value,
|
||||
operation_options{ _op_option_name, _eop_option_name, _bt_option_name });
|
||||
|
||||
if(_option_names.emplace(_op_option_name).second)
|
||||
{
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
std::string, _op_option_name.c_str(),
|
||||
"Inclusive filter for domain operations (for API domains, this selects "
|
||||
"the functions to trace) [regex supported]",
|
||||
std::string{}, "rocm", "rocprofiler-sdk", "advanced")
|
||||
->set_choices(_op_choices);
|
||||
}
|
||||
|
||||
if(_option_names.emplace(_eop_option_name).second)
|
||||
{
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
std::string, _eop_option_name.c_str(),
|
||||
"Exclusive filter for domain operations applied after the inclusive "
|
||||
"filter (for API domains, removes function from trace) [regex supported]",
|
||||
std::string{}, "rocm", "rocprofiler-sdk", "advanced")
|
||||
->set_choices(_op_choices);
|
||||
}
|
||||
|
||||
if(_option_names.emplace(_bt_option_name).second)
|
||||
{
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
std::string, _bt_option_name.c_str(),
|
||||
"Specification of domain operations which will record a backtrace (for "
|
||||
"API domains, this is a list of function names) [regex supported]",
|
||||
std::string{}, "rocm", "rocprofiler-sdk", "advanced")
|
||||
->set_choices(_op_choices);
|
||||
}
|
||||
};
|
||||
|
||||
_domain_choices.reserve(buffered_tracing_info.size());
|
||||
_domain_choices.reserve(callback_tracing_info.size());
|
||||
_add_domain("hip_api");
|
||||
_add_domain("hsa_api");
|
||||
_add_domain("marker_api");
|
||||
|
||||
for(const auto& itr : buffered_tracing_info)
|
||||
_add_domain(itr.name);
|
||||
|
||||
for(const auto& itr : callback_tracing_info)
|
||||
_add_domain(itr.name);
|
||||
|
||||
std::sort(_domain_choices.begin(), _domain_choices.end());
|
||||
|
||||
namespace join = ::timemory::join;
|
||||
auto _domain_description =
|
||||
JOIN("", "Specification of ROCm domains to trace/profile. Choices: ",
|
||||
join::join(join::array_config{ ", ", "", "" }, _domain_choices));
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_DOMAINS", _domain_description,
|
||||
std::string{ "hip_runtime_api,marker_api,kernel_dispatch,"
|
||||
"memory_copy,scratch_memory,page_migration" },
|
||||
"rocm", "rocprofiler-sdk")
|
||||
->set_choices(_domain_choices);
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
std::string, "ROCPROFSYS_ROCM_EVENTS",
|
||||
"ROCm hardware counters. Use ':device=N' syntax to specify collection on device "
|
||||
"number N, e.g. ':device=0'. If no device specification is provided, the event "
|
||||
"is collected on every available device",
|
||||
"", "rocm", "hardware_counters");
|
||||
|
||||
_skip_domains.emplace("kernel_dispatch");
|
||||
_skip_domains.emplace("page_migration");
|
||||
_skip_domains.emplace("scratch_memory");
|
||||
|
||||
_add_operation_settings(
|
||||
"MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API],
|
||||
callback_operation_option_names);
|
||||
|
||||
for(const auto& itr : callback_tracing_info)
|
||||
_add_operation_settings(itr.name, itr, callback_operation_option_names);
|
||||
|
||||
for(const auto& itr : buffered_tracing_info)
|
||||
_add_operation_settings(itr.name, itr, buffered_operation_option_names);
|
||||
}
|
||||
|
||||
std::unordered_set<rocprofiler_callback_tracing_kind_t>
|
||||
get_callback_domains()
|
||||
{
|
||||
const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();
|
||||
const auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>{
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
};
|
||||
|
||||
auto _data = std::unordered_set<rocprofiler_callback_tracing_kind_t>{};
|
||||
auto _domains =
|
||||
tim::delimit(config::get_setting_value<std::string>("ROCPROFSYS_ROCM_DOMAINS")
|
||||
.value_or(std::string{}),
|
||||
" ,;:\t\n");
|
||||
|
||||
const auto valid_choices =
|
||||
settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices();
|
||||
|
||||
auto invalid_domain = [&valid_choices](const auto& domainv) {
|
||||
return !std::any_of(valid_choices.begin(), valid_choices.end(),
|
||||
[&domainv](const auto& aitr) { return (aitr == domainv); });
|
||||
};
|
||||
|
||||
for(const auto& itr : _domains)
|
||||
{
|
||||
if(invalid_domain(itr))
|
||||
{
|
||||
ROCPROFSYS_THROW("unsupported ROCPROFSYS_ROCM_DOMAINS value: %s\n",
|
||||
itr.c_str());
|
||||
}
|
||||
|
||||
if(itr == "hsa_api")
|
||||
{
|
||||
for(auto eitr : { ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API })
|
||||
_data.emplace(eitr);
|
||||
}
|
||||
else if(itr == "hip_api")
|
||||
{
|
||||
for(auto eitr : { ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API })
|
||||
_data.emplace(eitr);
|
||||
}
|
||||
else if(itr == "marker_api" || itr == "roctx")
|
||||
{
|
||||
_data.emplace(ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(size_t idx = 0; idx < callback_tracing_info.size(); ++idx)
|
||||
{
|
||||
auto ditr = callback_tracing_info[idx];
|
||||
auto dval = static_cast<rocprofiler_callback_tracing_kind_t>(idx);
|
||||
if(itr == to_lower(ditr.name) && supported.count(dval) > 0)
|
||||
{
|
||||
_data.emplace(dval);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return _data;
|
||||
}
|
||||
|
||||
std::unordered_set<rocprofiler_buffer_tracing_kind_t>
|
||||
get_buffered_domains()
|
||||
{
|
||||
const auto buffer_tracing_info = rocprofiler::sdk::get_buffer_tracing_names();
|
||||
const auto supported = std::unordered_set<rocprofiler_buffer_tracing_kind_t>{
|
||||
ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
|
||||
ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION,
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
};
|
||||
|
||||
auto _data = std::unordered_set<rocprofiler_buffer_tracing_kind_t>{};
|
||||
auto _domains =
|
||||
tim::delimit(config::get_setting_value<std::string>("ROCPROFSYS_ROCM_DOMAINS")
|
||||
.value_or(std::string{}),
|
||||
" ,;:\t\n");
|
||||
const auto valid_choices =
|
||||
settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices();
|
||||
|
||||
auto invalid_domain = [&valid_choices](const auto& domainv) {
|
||||
return !std::any_of(valid_choices.begin(), valid_choices.end(),
|
||||
[&domainv](const auto& aitr) { return (aitr == domainv); });
|
||||
};
|
||||
|
||||
for(const auto& itr : _domains)
|
||||
{
|
||||
if(invalid_domain(itr))
|
||||
{
|
||||
ROCPROFSYS_THROW("unsupported ROCPROFSYS_ROCM_DOMAINS value: %s\n",
|
||||
itr.c_str());
|
||||
}
|
||||
|
||||
if(itr == "hsa_api")
|
||||
{
|
||||
for(auto eitr : { ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API })
|
||||
_data.emplace(eitr);
|
||||
}
|
||||
else if(itr == "hip_api")
|
||||
{
|
||||
for(auto eitr : { ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API,
|
||||
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API })
|
||||
_data.emplace(eitr);
|
||||
}
|
||||
else if(itr == "marker_api" || itr == "roctx")
|
||||
{
|
||||
_data.emplace(ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(size_t idx = 0; idx < buffer_tracing_info.size(); ++idx)
|
||||
{
|
||||
auto ditr = buffer_tracing_info[idx];
|
||||
auto dval = static_cast<rocprofiler_buffer_tracing_kind_t>(idx);
|
||||
if(itr == to_lower(ditr.name) && supported.count(dval) > 0)
|
||||
{
|
||||
_data.emplace(dval);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return _data;
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
get_rocm_events()
|
||||
{
|
||||
return tim::delimit(
|
||||
get_setting_value<std::string>("ROCPROFSYS_ROCM_EVENTS").value_or(std::string{}),
|
||||
" ,;\t\n");
|
||||
}
|
||||
|
||||
std::vector<int32_t>
|
||||
get_operations(rocprofiler_callback_tracing_kind_t kindv)
|
||||
{
|
||||
ROCPROFSYS_CONDITIONAL_ABORT_F(
|
||||
callback_operation_option_names.count(kindv) == 0,
|
||||
"callback_operation_operation_names does not have value for %i\n", kindv);
|
||||
|
||||
auto _complete = get_operations_impl(kindv);
|
||||
auto _include = get_operations_impl(
|
||||
kindv, callback_operation_option_names.at(kindv).operations_include);
|
||||
auto _exclude = get_operations_impl(
|
||||
kindv, callback_operation_option_names.at(kindv).operations_exclude);
|
||||
|
||||
return get_operations_impl(_complete, _include, _exclude);
|
||||
}
|
||||
|
||||
std::vector<int32_t>
|
||||
get_operations(rocprofiler_buffer_tracing_kind_t kindv)
|
||||
{
|
||||
ROCPROFSYS_CONDITIONAL_ABORT_F(
|
||||
buffered_operation_option_names.count(kindv) == 0,
|
||||
"buffered_operation_option_names does not have value for %i\n", kindv);
|
||||
|
||||
auto _complete = get_operations_impl(kindv);
|
||||
auto _include = get_operations_impl(
|
||||
kindv, buffered_operation_option_names.at(kindv).operations_include);
|
||||
auto _exclude = get_operations_impl(
|
||||
kindv, buffered_operation_option_names.at(kindv).operations_exclude);
|
||||
|
||||
return get_operations_impl(_complete, _include, _exclude);
|
||||
}
|
||||
|
||||
std::unordered_set<int32_t>
|
||||
get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv)
|
||||
{
|
||||
ROCPROFSYS_CONDITIONAL_ABORT_F(
|
||||
callback_operation_option_names.count(kindv) == 0,
|
||||
"callback_operation_operation_names does not have value for %i\n", kindv);
|
||||
|
||||
auto _data = get_operations_impl(
|
||||
kindv, callback_operation_option_names.at(kindv).operations_annotate_backtrace);
|
||||
auto _ret = std::unordered_set<int32_t>{};
|
||||
_ret.reserve(_data.size());
|
||||
for(auto itr : _data)
|
||||
_ret.emplace(itr);
|
||||
return _ret;
|
||||
}
|
||||
|
||||
std::unordered_set<int32_t>
|
||||
get_backtrace_operations(rocprofiler_buffer_tracing_kind_t kindv)
|
||||
{
|
||||
ROCPROFSYS_CONDITIONAL_ABORT_F(
|
||||
buffered_operation_option_names.count(kindv) == 0,
|
||||
"buffered_operation_option_names does not have value for %i\n", kindv);
|
||||
|
||||
auto _data = get_operations_impl(
|
||||
kindv, buffered_operation_option_names.at(kindv).operations_annotate_backtrace);
|
||||
auto _ret = std::unordered_set<int32_t>{};
|
||||
_ret.reserve(_data.size());
|
||||
for(auto itr : _data)
|
||||
_ret.emplace(itr);
|
||||
return _ret;
|
||||
}
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
|
||||
#else
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
void
|
||||
config_settings(const std::shared_ptr<settings>&)
|
||||
{}
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,70 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/timemory.hpp"
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM)
|
||||
# include <rocprofiler-sdk/fwd.h>
|
||||
# include <rocprofiler-sdk/rocprofiler.h>
|
||||
#endif
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
void
|
||||
config_settings(const std::shared_ptr<settings>&);
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM)
|
||||
|
||||
std::unordered_set<rocprofiler_callback_tracing_kind_t>
|
||||
get_callback_domains();
|
||||
|
||||
std::unordered_set<rocprofiler_buffer_tracing_kind_t>
|
||||
get_buffered_domains();
|
||||
|
||||
std::vector<int32_t>
|
||||
get_operations(rocprofiler_callback_tracing_kind_t kindv);
|
||||
|
||||
std::vector<int32_t>
|
||||
get_operations(rocprofiler_buffer_tracing_kind_t kindv);
|
||||
|
||||
std::vector<std::string>
|
||||
get_rocm_events();
|
||||
|
||||
std::unordered_set<int32_t>
|
||||
get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv);
|
||||
|
||||
std::unordered_set<int32_t>
|
||||
get_backtrace_operations(rocprofiler_buffer_tracing_kind_t kindv);
|
||||
|
||||
#endif
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
@@ -21,6 +21,7 @@
|
||||
// SOFTWARE.
|
||||
|
||||
#include "state.hpp"
|
||||
#include "common/static_object.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "utility.hpp"
|
||||
@@ -35,8 +36,9 @@ namespace
|
||||
auto&
|
||||
get_state_value()
|
||||
{
|
||||
static auto _v = std::atomic<State>{ State::PreInit };
|
||||
return _v;
|
||||
static auto*& _v = common::static_object<std::atomic<State>>::construct(
|
||||
common::do_not_destroy{}, State::PreInit);
|
||||
return *_v;
|
||||
}
|
||||
|
||||
ThreadState&
|
||||
|
||||
@@ -74,6 +74,15 @@ get_reserved_vector(size_t _n)
|
||||
return _v;
|
||||
}
|
||||
|
||||
/// Reserves capacity for at least \p _n elements in the vector passed as an
/// rvalue and hands the same vector back as an rvalue reference (no copy is
/// made; existing elements are kept).
///
/// Because a reference to the argument is returned, the result should be used
/// to initialize/move into a vector before the argument's lifetime ends.
template <typename... Tp>
inline decltype(auto)
get_reserved_vector(std::vector<Tp...>&& _v, size_t _n)
{
    _v.reserve(_n);
    // _v is a named rvalue reference of a concrete vector type, so std::move
    // yields the same xvalue the original std::forward cast produced
    return std::move(_v);
}
|
||||
|
||||
template <typename Tp, size_t Offset>
|
||||
struct offset_index_sequence;
|
||||
|
||||
|
||||
@@ -25,7 +25,8 @@ target_include_directories(
|
||||
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocprof-sys-user>
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/../rocprof-sys>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
|
||||
PRIVATE ${rocprofiler-sdk_INCLUDE_DIR})
|
||||
target_link_libraries(
|
||||
rocprofiler-systems-dl-library
|
||||
PUBLIC $<BUILD_INTERFACE:${dl_LIBRARY}>
|
||||
|
||||
@@ -54,6 +54,14 @@
|
||||
#include <thread>
|
||||
#include <unistd.h>
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
# define ROCPROFSYS_USE_ROCM 0
|
||||
#endif
|
||||
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
# include <rocprofiler-sdk/registration.h>
|
||||
#endif
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
#define ROCPROFSYS_DLSYM(VARNAME, HANDLE, FUNCNAME) \
|
||||
@@ -79,6 +87,7 @@
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
using main_func_t = int (*)(int, char**, char**);
|
||||
using init_func_t = void (*)(void);
|
||||
|
||||
std::ostream&
|
||||
operator<<(std::ostream& _os, const SpaceHandle& _handle)
|
||||
@@ -360,14 +369,8 @@ struct ROCPROFSYS_INTERNAL_API indirect
|
||||
ROCPROFSYS_DLSYM(kokkosp_dual_view_modify_f, m_omnihandle,
|
||||
"kokkosp_dual_view_modify");
|
||||
|
||||
#if ROCPROFSYS_USE_ROCTRACER > 0
|
||||
ROCPROFSYS_DLSYM(hsa_on_load_f, m_omnihandle, "OnLoad");
|
||||
ROCPROFSYS_DLSYM(hsa_on_unload_f, m_omnihandle, "OnUnload");
|
||||
#endif
|
||||
|
||||
#if ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
ROCPROFSYS_DLSYM(rocp_on_load_tool_prop_f, m_omnihandle, "OnLoadToolProp");
|
||||
ROCPROFSYS_DLSYM(rocp_on_unload_tool_f, m_omnihandle, "OnUnloadTool");
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
ROCPROFSYS_DLSYM(rocprofiler_configure_f, m_omnihandle, "rocprofiler_configure");
|
||||
#endif
|
||||
|
||||
#if ROCPROFSYS_USE_OMPT == 0
|
||||
@@ -460,16 +463,9 @@ public:
|
||||
void (*kokkosp_dual_view_sync_f)(const char*, const void* const, bool) = nullptr;
|
||||
void (*kokkosp_dual_view_modify_f)(const char*, const void* const, bool) = nullptr;
|
||||
|
||||
// HSA functions
|
||||
#if ROCPROFSYS_USE_ROCTRACER > 0
|
||||
bool (*hsa_on_load_f)(HsaApiTable*, uint64_t, uint64_t, const char* const*) = nullptr;
|
||||
void (*hsa_on_unload_f)() = nullptr;
|
||||
#endif
|
||||
|
||||
// ROCP functions
|
||||
#if ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
void (*rocp_on_load_tool_prop_f)(void* settings) = nullptr;
|
||||
void (*rocp_on_unload_tool_f)() = nullptr;
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
rocprofiler_tool_configure_result_t* (*rocprofiler_configure_f)(
|
||||
uint32_t, const char*, uint32_t, rocprofiler_client_id_t*) = nullptr;
|
||||
#endif
|
||||
|
||||
// OpenMP functions
|
||||
@@ -644,13 +640,18 @@ extern "C"
|
||||
|
||||
bool _invoked = false;
|
||||
ROCPROFSYS_DL_INVOKE_STATUS(_invoked, get_indirect().rocprofsys_init_f, a, b, c);
|
||||
|
||||
if(_invoked)
|
||||
{
|
||||
dl::get_active() = true;
|
||||
dl::get_inited() = true;
|
||||
dl::_rocprofsys_dl_verbose = dl::get_rocprofsys_dl_env();
|
||||
if(dl::get_instrumented() < dl::InstrumentMode::PythonProfile)
|
||||
|
||||
if(dl::get_instrumented() >= dl::InstrumentMode::None &&
|
||||
dl::get_instrumented() < dl::InstrumentMode::PythonProfile)
|
||||
{
|
||||
dl::rocprofsys_postinit((c) ? std::string{ c } : std::string{});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1069,43 +1070,17 @@ extern "C"
|
||||
|
||||
//----------------------------------------------------------------------------------//
|
||||
//
|
||||
// HSA
|
||||
// ROCm
|
||||
//
|
||||
//----------------------------------------------------------------------------------//
|
||||
|
||||
#if ROCPROFSYS_USE_ROCTRACER > 0
|
||||
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
|
||||
const char* const* failed_tool_names)
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
rocprofiler_tool_configure_result_t* rocprofiler_configure(
|
||||
uint32_t version, const char* runtime_version, uint32_t priority,
|
||||
rocprofiler_client_id_t* client_id)
|
||||
{
|
||||
return ROCPROFSYS_DL_INVOKE(get_indirect().hsa_on_load_f, table, runtime_version,
|
||||
failed_tool_count, failed_tool_names);
|
||||
}
|
||||
|
||||
void OnUnload() { return ROCPROFSYS_DL_INVOKE(get_indirect().hsa_on_unload_f); }
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------------//
|
||||
//
|
||||
// ROCP
|
||||
//
|
||||
//----------------------------------------------------------------------------------//
|
||||
|
||||
#if ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
void OnLoadToolProp(void* settings)
|
||||
{
|
||||
ROCPROFSYS_DL_LOG(
|
||||
-16,
|
||||
"invoking %s(rocprofiler_settings_t*) within librocprof-sys-dl.so "
|
||||
"will cause a silent failure for rocprofiler. ROCP_TOOL_LIB "
|
||||
"should be set to librocprof-sys.so\n",
|
||||
__FUNCTION__);
|
||||
abort();
|
||||
return ROCPROFSYS_DL_INVOKE(get_indirect().rocp_on_load_tool_prop_f, settings);
|
||||
}
|
||||
|
||||
void OnUnloadTool()
|
||||
{
|
||||
return ROCPROFSYS_DL_INVOKE(get_indirect().rocp_on_unload_tool_f);
|
||||
return ROCPROFSYS_DL_INVOKE(get_indirect().rocprofiler_configure_f, version,
|
||||
runtime_version, priority, client_id);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1227,7 +1202,9 @@ rocprofsys_preinit()
|
||||
void
|
||||
rocprofsys_postinit(std::string _exe)
|
||||
{
|
||||
switch(get_instrumented())
|
||||
InstrumentMode instrumentMode = get_instrumented();
|
||||
|
||||
switch(instrumentMode)
|
||||
{
|
||||
case InstrumentMode::None:
|
||||
case InstrumentMode::BinaryRewrite:
|
||||
@@ -1393,20 +1370,122 @@ verify_instrumented_preloaded()
|
||||
|
||||
bool _handle_preload = rocprofsys_preload();
|
||||
main_func_t main_real = nullptr;
|
||||
init_func_t init_real = nullptr;
|
||||
} // namespace
|
||||
} // namespace dl
|
||||
} // namespace rocprofsys
|
||||
|
||||
extern "C"
|
||||
{
|
||||
void rocprofsys_main_init(void) ROCPROFSYS_INTERNAL_API;
|
||||
int rocprofsys_main(int argc, char** argv, char** envp) ROCPROFSYS_INTERNAL_API;
|
||||
|
||||
void rocprofsys_set_main_init(init_func_t) ROCPROFSYS_INTERNAL_API;
|
||||
void rocprofsys_set_main(main_func_t) ROCPROFSYS_INTERNAL_API;
|
||||
|
||||
// Stores the program's real init function pointer in dl::init_real, which
// rocprofsys_main_init() later invokes.
void rocprofsys_set_main_init(init_func_t _init_real)
{
    using namespace ::rocprofsys;
    dl::init_real = _init_real;
}
|
||||
|
||||
// Stores the program's real main function pointer in dl::main_real for use by
// the rocprofsys_main() wrapper.
void rocprofsys_set_main(main_func_t _main_real)
{
    using namespace ::rocprofsys;
    dl::main_real = _main_real;
}
|
||||
|
||||
// void rocprofsys_main_init(int argc, char** argv, char** envp)
|
||||
// {
|
||||
// ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__);
|
||||
// using ::rocprofsys::common::get_env;
|
||||
// using ::rocprofsys::dl::get_default_mode;
|
||||
|
||||
// // prevent re-entry
|
||||
// static int _reentry = 0;
|
||||
// if(_reentry > 0) return -1;
|
||||
// _reentry = 1;
|
||||
|
||||
// int ret = 0;
|
||||
|
||||
// if(::rocprofsys::dl::init_real)
|
||||
// {
|
||||
// if(envp)
|
||||
// {
|
||||
// size_t _idx = 0;
|
||||
// while(envp[_idx] != nullptr)
|
||||
// {
|
||||
// auto _env_v = std::string_view{ envp[_idx++] };
|
||||
// if(_env_v.find("ROCPROFSYS") != 0 &&
|
||||
// _env_v.find("librocprof-sys") == std::string_view::npos)
|
||||
// continue;
|
||||
// auto _pos = _env_v.find('=');
|
||||
// if(_pos < _env_v.length())
|
||||
// {
|
||||
// auto _var = std::string{ _env_v }.substr(0, _pos);
|
||||
// auto _val = std::string{ _env_v }.substr(_pos + 1);
|
||||
// ROCPROFSYS_DL_LOG(1, "%s(%s, %s)\n", "rocprofsys_set_env",
|
||||
// _var.c_str(), _val.c_str());
|
||||
// setenv(_var.c_str(), _val.c_str(), 0);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// ret = (*::rocprofsys::dl::init_real)(argc, argv, envp);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// ROCPROFSYS_DL_LOG(
|
||||
// 0, "%s\n",
|
||||
// "Unsuccessful wrapping of init: nullptr to real init function");
|
||||
// }
|
||||
|
||||
// auto _mode = get_env("ROCPROFSYS_MODE", get_default_mode());
|
||||
// rocprofsys_init(_mode.c_str(),
|
||||
// dl::get_instrumented() == dl::InstrumentMode::BinaryRewrite,
|
||||
// argv[0]);
|
||||
|
||||
// return ret;
|
||||
// }
|
||||
|
||||
// int rocprofsys_main(int argc, char** argv, char** envp)
|
||||
// {
|
||||
// ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__);
|
||||
|
||||
// // prevent re-entry
|
||||
// static int _reentry = 0;
|
||||
// if(_reentry > 0) return -1;
|
||||
// _reentry = 1;
|
||||
|
||||
// if(!::rocprofsys::dl::main_real)
|
||||
// throw std::runtime_error("[rocprof-sys][dl] Unsuccessful wrapping of main:
|
||||
// "
|
||||
// "nullptr to real main function");
|
||||
|
||||
// rocprofsys_push_trace(basename(argv[0]));
|
||||
|
||||
// int ret = (*::rocprofsys::dl::main_real)(argc, argv, envp);
|
||||
|
||||
// rocprofsys_pop_trace(basename(argv[0]));
|
||||
// rocprofsys_finalize();
|
||||
|
||||
// return ret;
|
||||
// }
|
||||
|
||||
void rocprofsys_main_init(void)
|
||||
{
|
||||
ROCPROFSYS_DL_LOG(0, "[%s].\n", __FUNCTION__);
|
||||
|
||||
if(::rocprofsys::dl::init_real)
|
||||
{
|
||||
// Call real init function
|
||||
(*::rocprofsys::dl::init_real)();
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCPROFSYS_DL_LOG(
|
||||
0, "Unsuccessful wrapping of init: real_init function is nullptr.\n");
|
||||
}
|
||||
}
|
||||
|
||||
int rocprofsys_main(int argc, char** argv, char** envp)
|
||||
{
|
||||
ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__);
|
||||
@@ -1420,7 +1499,7 @@ extern "C"
|
||||
|
||||
if(!::rocprofsys::dl::main_real)
|
||||
throw std::runtime_error("[rocprof-sys][dl] Unsuccessful wrapping of main: "
|
||||
"nullptr to real main function");
|
||||
"real_main function is nullptr.");
|
||||
|
||||
if(envp)
|
||||
{
|
||||
@@ -1455,4 +1534,4 @@ extern "C"
|
||||
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
} // extern "C"
|
||||
|
||||
@@ -53,12 +53,8 @@
|
||||
# define ROCPROFSYS_USE_OMPT 0
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
# define ROCPROFSYS_USE_ROCTRACER 0
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
# define ROCPROFSYS_USE_ROCPROFILER 0
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
# define ROCPROFSYS_USE_ROCM 0
|
||||
#endif
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
@@ -177,20 +173,12 @@ extern "C"
|
||||
const char*) ROCPROFSYS_PUBLIC_API;
|
||||
# endif
|
||||
|
||||
# if ROCPROFSYS_USE_ROCTRACER > 0
|
||||
// HSA
|
||||
struct HsaApiTable;
|
||||
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
|
||||
const char* const* failed_tool_names) ROCPROFSYS_PUBLIC_API;
|
||||
void OnUnload() ROCPROFSYS_PUBLIC_API;
|
||||
# if ROCPROFSYS_USE_ROCM > 0
|
||||
struct rocprofiler_tool_configure_result_t;
|
||||
struct rocprofiler_client_id_t;
|
||||
# endif
|
||||
|
||||
# if ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
// ROCP
|
||||
void OnLoadToolProp(void* settings) ROCPROFSYS_PUBLIC_API;
|
||||
void OnUnloadTool() ROCPROFSYS_PUBLIC_API;
|
||||
# endif
|
||||
#endif
|
||||
#endif // ROCPROFSYS_DL_SOURCE
|
||||
}
|
||||
|
||||
namespace rocprofsys
|
||||
|
||||
@@ -37,20 +37,19 @@
|
||||
// local type definitions
|
||||
//
|
||||
typedef int (*main_func_t)(int, char**, char**);
|
||||
typedef int (*start_main_t)(int (*)(int, char**, char**), int, char**,
|
||||
int (*)(int, char**, char**), void (*)(void), void (*)(void),
|
||||
void*);
|
||||
typedef void (*init_func_t)(void);
|
||||
typedef int (*start_main_t)(int (*)(int, char**, char**), int, char**, void (*)(void),
|
||||
void (*)(void), void (*)(void), void*);
|
||||
|
||||
//
|
||||
// local function declarations
|
||||
//
|
||||
int
|
||||
rocprofsys_libc_start_main(int (*)(int, char**, char**), int, char**,
|
||||
int (*)(int, char**, char**), void (*)(void), void (*)(void),
|
||||
void*) ROCPROFSYS_INTERNAL_API;
|
||||
rocprofsys_libc_start_main(int (*)(int, char**, char**), int, char**, void (*)(void),
|
||||
void (*)(void), void (*)(void), void*) ROCPROFSYS_INTERNAL_API;
|
||||
|
||||
int
|
||||
__libc_start_main(int (*)(int, char**, char**), int, char**, int (*)(int, char**, char**),
|
||||
__libc_start_main(int (*)(int, char**, char**), int, char**, void (*)(void),
|
||||
void (*)(void), void (*)(void), void*) ROCPROFSYS_PUBLIC_API;
|
||||
|
||||
//
|
||||
@@ -79,12 +78,18 @@ basename(const char*);
|
||||
|
||||
extern void rocprofsys_set_main(main_func_t) ROCPROFSYS_INTERNAL_API;
|
||||
|
||||
extern void
|
||||
rocprofsys_set_main_init(init_func_t func) ROCPROFSYS_INTERNAL_API;
|
||||
|
||||
extern void
|
||||
rocprofsys_main_init(void) ROCPROFSYS_INTERNAL_API;
|
||||
|
||||
extern int
|
||||
rocprofsys_main(int argc, char** argv, char** envp) ROCPROFSYS_INTERNAL_API;
|
||||
|
||||
int
|
||||
rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv,
|
||||
int (*_init)(int, char**, char**), void (*_fini)(void),
|
||||
void (*_init)(void), void (*_fini)(void),
|
||||
void (*_rtld_fini)(void), void* _stack_end)
|
||||
{
|
||||
int _preload = rocprofsys_preload_library();
|
||||
@@ -97,8 +102,9 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char**
|
||||
// get the address of this function
|
||||
void* _this_func = __builtin_return_address(0);
|
||||
|
||||
// Save the real main function address
|
||||
// Save the real main function addresses
|
||||
rocprofsys_set_main(_main);
|
||||
rocprofsys_set_main_init(_init);
|
||||
|
||||
// Find the real __libc_start_main()
|
||||
start_main_t user_main = dlsym(RTLD_NEXT, "__libc_start_main");
|
||||
@@ -115,6 +121,10 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char**
|
||||
}
|
||||
else
|
||||
{
|
||||
// return user_main(rocprofsys_main, _argc, _argv,
|
||||
// rocprofsys_main_init, _fini,
|
||||
// _rtld_fini, _stack_end);
|
||||
|
||||
// call rocprof-sys main function wrapper
|
||||
return user_main(rocprofsys_main, _argc, _argv, _init, _fini, _rtld_fini,
|
||||
_stack_end);
|
||||
@@ -129,9 +139,10 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char**
|
||||
|
||||
int
|
||||
__libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv,
|
||||
int (*_init)(int, char**, char**), void (*_fini)(void),
|
||||
void (*_rtld_fini)(void), void* _stack_end)
|
||||
void (*_init)(void), void (*_fini)(void), void (*_rtld_fini)(void),
|
||||
void* _stack_end)
|
||||
{
|
||||
// intercept the main function
|
||||
return rocprofsys_libc_start_main(_main, _argc, _argv, _init, _fini, _rtld_fini,
|
||||
_stack_end);
|
||||
}
|
||||
|
||||
+9
-7
@@ -43,19 +43,21 @@ extern "C"
|
||||
ROCPROFSYS_CATEGORY_PYTHON,
|
||||
ROCPROFSYS_CATEGORY_USER,
|
||||
ROCPROFSYS_CATEGORY_HOST,
|
||||
ROCPROFSYS_CATEGORY_DEVICE_HIP,
|
||||
ROCPROFSYS_CATEGORY_DEVICE_HSA,
|
||||
ROCPROFSYS_CATEGORY_ROCM_HIP,
|
||||
ROCPROFSYS_CATEGORY_ROCM_HSA,
|
||||
ROCPROFSYS_CATEGORY_ROCM_ROCTX,
|
||||
ROCPROFSYS_CATEGORY_ROCM,
|
||||
ROCPROFSYS_CATEGORY_ROCM_HIP_API,
|
||||
ROCPROFSYS_CATEGORY_ROCM_HSA_API,
|
||||
ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH,
|
||||
ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY,
|
||||
ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY,
|
||||
ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION,
|
||||
ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION,
|
||||
ROCPROFSYS_CATEGORY_ROCM_MARKER_API,
|
||||
ROCPROFSYS_CATEGORY_ROCM_SMI,
|
||||
ROCPROFSYS_CATEGORY_ROCM_SMI_BUSY,
|
||||
ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP,
|
||||
ROCPROFSYS_CATEGORY_ROCM_SMI_POWER,
|
||||
ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE,
|
||||
ROCPROFSYS_CATEGORY_ROCM_RCCL,
|
||||
ROCPROFSYS_CATEGORY_ROCTRACER,
|
||||
ROCPROFSYS_CATEGORY_ROCPROFILER,
|
||||
ROCPROFSYS_CATEGORY_SAMPLING,
|
||||
ROCPROFSYS_CATEGORY_PTHREAD,
|
||||
ROCPROFSYS_CATEGORY_KOKKOS,
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
//
|
||||
#include "api.hpp"
|
||||
#include "common/setup.hpp"
|
||||
#include "common/static_object.hpp"
|
||||
#include "core/categories.hpp"
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/concepts.hpp"
|
||||
@@ -46,13 +47,12 @@
|
||||
#include "library/components/mpi_gotcha.hpp"
|
||||
#include "library/components/numa_gotcha.hpp"
|
||||
#include "library/components/pthread_gotcha.hpp"
|
||||
#include "library/components/rocprofiler.hpp"
|
||||
#include "library/coverage.hpp"
|
||||
#include "library/ompt.hpp"
|
||||
#include "library/process_sampler.hpp"
|
||||
#include "library/ptl.hpp"
|
||||
#include "library/rcclp.hpp"
|
||||
#include "library/rocprofiler.hpp"
|
||||
#include "library/rocprofiler-sdk.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
@@ -399,10 +399,6 @@ rocprofsys_init_library_hidden()
|
||||
if(_debug_init) config::set_setting_value("ROCPROFSYS_DEBUG", _debug_value);
|
||||
} };
|
||||
|
||||
tim::trait::runtime_enabled<comp::roctracer>::set(get_use_roctracer());
|
||||
tim::trait::runtime_enabled<comp::roctracer_data>::set(get_use_roctracer() &&
|
||||
get_use_timemory());
|
||||
|
||||
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n");
|
||||
}
|
||||
|
||||
@@ -718,13 +714,6 @@ rocprofsys_finalize_hidden(void)
|
||||
}
|
||||
}
|
||||
|
||||
if(get_use_roctracer())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Flushing roctracer...\n");
|
||||
// ensure that roctracer is flushed before setting the state to finalized
|
||||
comp::roctracer::flush();
|
||||
}
|
||||
|
||||
set_state(State::Finalized);
|
||||
|
||||
push_enable_sampling_on_child_threads(false);
|
||||
@@ -785,6 +774,14 @@ rocprofsys_finalize_hidden(void)
|
||||
ompt::shutdown();
|
||||
}
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
// TODO: option for rocm
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down ROCm...\n");
|
||||
rocprofiler_sdk::shutdown();
|
||||
}
|
||||
#endif
|
||||
|
||||
ROCPROFSYS_DEBUG_F("Stopping and destroying instrumentation bundles...\n");
|
||||
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
|
||||
{
|
||||
@@ -835,24 +832,6 @@ rocprofsys_finalize_hidden(void)
|
||||
process_sampler::shutdown();
|
||||
}
|
||||
|
||||
if(get_use_roctracer())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down roctracer...\n");
|
||||
// ensure that threads running roctracer callbacks shutdown
|
||||
comp::roctracer::shutdown();
|
||||
|
||||
// join extra thread(s) used by roctracer
|
||||
ROCPROFSYS_VERBOSE_F(2, "Waiting on roctracer tasks...\n");
|
||||
tasking::join();
|
||||
}
|
||||
|
||||
if(get_use_rocprofiler())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down rocprofiler...\n");
|
||||
rocprofiler::post_process();
|
||||
rocprofiler::rocm_cleanup();
|
||||
}
|
||||
|
||||
if(get_use_causal())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down causal sampling...\n");
|
||||
@@ -919,7 +898,7 @@ rocprofsys_finalize_hidden(void)
|
||||
process_sampler::post_process();
|
||||
}
|
||||
|
||||
// shutdown tasking before timemory is finalized, especially the roctracer thread-pool
|
||||
// shutdown tasking before timemory is finalized
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down thread-pools...\n");
|
||||
tasking::shutdown();
|
||||
|
||||
@@ -991,6 +970,8 @@ rocprofsys_finalize_hidden(void)
|
||||
tim::signals::enable_signal_detection(
|
||||
{ tim::signals::sys_signal::SegFault, tim::signals::sys_signal::Stop },
|
||||
[](int) {});
|
||||
|
||||
common::destroy_static_objects();
|
||||
}
|
||||
|
||||
//======================================================================================//
|
||||
|
||||
@@ -23,8 +23,7 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocm.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocm_smi.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/runtime.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/sampling.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/thread_data.hpp
|
||||
@@ -35,37 +34,23 @@ set(library_headers
|
||||
target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources}
|
||||
${library_headers})
|
||||
|
||||
if(ROCPROFSYS_USE_ROCTRACER OR ROCPROFSYS_USE_ROCPROFILER)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm.cpp)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_ROCTRACER)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_RCCL)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_ROCPROFILER)
|
||||
if(ROCPROFSYS_USE_ROCM)
|
||||
target_sources(
|
||||
rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_ROCM_SMI)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp)
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp)
|
||||
add_subdirectory(rocprofiler-sdk)
|
||||
endif()
|
||||
|
||||
add_subdirectory(causal)
|
||||
add_subdirectory(components)
|
||||
add_subdirectory(coverage)
|
||||
add_subdirectory(rocm)
|
||||
add_subdirectory(tracing)
|
||||
|
||||
set(ndebug_sources
|
||||
|
||||
@@ -28,8 +28,6 @@ set(component_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/numa_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp)
|
||||
@@ -37,16 +35,6 @@ set(component_headers
|
||||
target_sources(rocprofiler-systems-object-library PRIVATE ${component_sources}
|
||||
${component_headers})
|
||||
|
||||
if(ROCPROFSYS_USE_ROCPROFILER)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_ROCTRACER)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_RCCL)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
|
||||
|
||||
+4
-4
@@ -64,13 +64,13 @@ using tim::type_list;
|
||||
// these categories increment push/pop counts, which are used for sanity checks since
|
||||
// they should ALWAYS be popped if they were pushed
|
||||
using tracing_count_categories_t =
|
||||
type_list<category::host, category::mpi, category::pthread, category::rocm_hip,
|
||||
category::rocm_hsa, category::rocm_rccl>;
|
||||
type_list<category::host, category::mpi, category::pthread, category::rocm_hip_api,
|
||||
category::rocm_hsa_api, category::rocm_rccl>;
|
||||
|
||||
// convert these categories to throughput points
|
||||
using causal_throughput_categories_t =
|
||||
type_list<category::host, category::kokkos, category::ompt, category::rocm_hip,
|
||||
category::rocm_hsa, category::rocm_rccl, category::rocm_roctx>;
|
||||
type_list<category::host, category::kokkos, category::ompt, category::rocm_hip_api,
|
||||
category::rocm_hsa_api, category::rocm_rccl, category::rocm_marker_api>;
|
||||
|
||||
// define this outside of category region functions so that the
|
||||
// static thread_local is global instead of per-template instantiation
|
||||
|
||||
+1
-5
@@ -28,7 +28,6 @@
|
||||
#include "core/utility.hpp"
|
||||
#include "library/causal/delay.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/components/roctracer.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
@@ -61,7 +60,7 @@ shutdown();
|
||||
|
||||
namespace component
|
||||
{
|
||||
using bundle_t = tim::lightweight_tuple<comp::wall_clock, comp::roctracer_data>;
|
||||
using bundle_t = tim::lightweight_tuple<comp::wall_clock>;
|
||||
using category_region_t = tim::lightweight_tuple<category_region<category::pthread>>;
|
||||
|
||||
namespace
|
||||
@@ -82,7 +81,6 @@ inline void
|
||||
start_bundle(bundle_t& _bundle, int64_t _tid, Args&&... _args)
|
||||
{
|
||||
if(!get_use_timemory() && !get_use_perfetto()) return;
|
||||
trait::runtime_enabled<comp::roctracer_data>::set(get_use_roctracer());
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(3, "starting bundle '%s' in thread %li...\n",
|
||||
_bundle.key().c_str(), _tid);
|
||||
if constexpr(sizeof...(Args) > 0)
|
||||
@@ -619,5 +617,3 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
|
||||
}
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
|
||||
TIMEMORY_INITIALIZE_STORAGE(component::roctracer_data)
|
||||
|
||||
@@ -1,193 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/rocprofiler.hpp"
|
||||
#include "core/common.hpp"
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/defines.hpp"
|
||||
#include "core/dynamic_library.hpp"
|
||||
#include "core/perfetto.hpp"
|
||||
#include "core/redirect.hpp"
|
||||
#include "library/rocprofiler.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
|
||||
#include <timemory/storage/types.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
#include <timemory/variadic/functional.hpp>
|
||||
#include <timemory/variadic/lightweight_tuple.hpp>
|
||||
|
||||
#include <rocprofiler.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
namespace
|
||||
{
|
||||
// Returns the single shared counter that protect_flush_activity() decrements and
// increments. The counter is a function-local static, so it is created on first use.
std::atomic<int64_t>&
rocprofiler_activity_count()
{
    static std::atomic<int64_t> _count{ 0 };
    return _count;
}
|
||||
} // namespace
|
||||
|
||||
// Returns a reference to the rocm_data_t holder associated with thread index _tid,
// obtained from thread_data<rocm_data_t, rocm_event>::instance().
unique_ptr_t<rocm_data_t>&
rocm_data(int64_t _tid)
{
    return thread_data<rocm_data_t, rocm_event>::instance(construct_on_thread{ _tid });
}
|
||||
|
||||
rocm_event::rocm_event(uint32_t _dev, uint32_t _thr, uint32_t _queue,
|
||||
std::string _event_name, rocm_metric_type _begin,
|
||||
rocm_metric_type _end, uint32_t _feature_count, void* _features_v)
|
||||
: device_id{ _dev }
|
||||
, thread_id{ _thr }
|
||||
, queue_id{ _queue }
|
||||
, entry{ _begin }
|
||||
, exit{ _end }
|
||||
, name(std::move(_event_name))
|
||||
{
|
||||
feature_values.reserve(_feature_count);
|
||||
feature_names.reserve(_feature_count);
|
||||
auto* _features = static_cast<rocprofiler_feature_t*>(_features_v);
|
||||
for(uint32_t i = 0; i < _feature_count; ++i)
|
||||
{
|
||||
const rocprofiler_feature_t* p = &_features[i];
|
||||
feature_names.emplace_back(i);
|
||||
switch(p->data.kind)
|
||||
{
|
||||
// Output metrics results
|
||||
case ROCPROFILER_DATA_KIND_UNINIT: break;
|
||||
case ROCPROFILER_DATA_KIND_BYTES:
|
||||
feature_values.emplace_back(
|
||||
rocm_feature_value{ p->data.result_bytes.size });
|
||||
break;
|
||||
case ROCPROFILER_DATA_KIND_INT32:
|
||||
feature_values.emplace_back(rocm_feature_value{ p->data.result_int32 });
|
||||
break;
|
||||
case ROCPROFILER_DATA_KIND_FLOAT:
|
||||
feature_values.emplace_back(rocm_feature_value{ p->data.result_float });
|
||||
break;
|
||||
case ROCPROFILER_DATA_KIND_DOUBLE:
|
||||
feature_values.emplace_back(rocm_feature_value{ p->data.result_double });
|
||||
break;
|
||||
case ROCPROFILER_DATA_KIND_INT64:
|
||||
feature_values.emplace_back(rocm_feature_value{ p->data.result_int64 });
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string
|
||||
rocm_event::as_string() const
|
||||
{
|
||||
std::stringstream _ss{};
|
||||
_ss << name << ", device: " << device_id << ", queue: " << queue_id
|
||||
<< ", thread: " << thread_id << ", entry: " << entry << ", exit = " << exit;
|
||||
_ss.precision(3);
|
||||
_ss << std::fixed;
|
||||
for(size_t i = 0; i < feature_names.size(); ++i)
|
||||
{
|
||||
auto _name = rocprofsys::rocprofiler::get_data_labels().at(device_id).at(
|
||||
feature_names.at(i));
|
||||
_ss << ", " << _name << " = ";
|
||||
auto _as_string = [&_ss](auto&& itr) { _ss << std::setw(4) << itr; };
|
||||
std::visit(_as_string, feature_values.at(i));
|
||||
}
|
||||
return _ss.str();
|
||||
}
|
||||
|
||||
void
|
||||
rocprofiler::preinit()
|
||||
{
|
||||
rocprofiler_data::label() = "rocprofiler";
|
||||
rocprofiler_data::description() = "ROCm hardware counters";
|
||||
}
|
||||
|
||||
void
|
||||
rocprofiler::start()
|
||||
{
|
||||
if(tracker_type::start() == 0) setup();
|
||||
}
|
||||
|
||||
void
|
||||
rocprofiler::stop()
|
||||
{
|
||||
if(tracker_type::stop() == 0) shutdown();
|
||||
}
|
||||
|
||||
bool
|
||||
rocprofiler::is_setup()
|
||||
{
|
||||
return rocprofsys::rocprofiler::is_setup();
|
||||
}
|
||||
|
||||
void
|
||||
rocprofiler::add_setup(const std::string&, std::function<void()>&&)
|
||||
{}
|
||||
|
||||
void
|
||||
rocprofiler::add_shutdown(const std::string&, std::function<void()>&&)
|
||||
{}
|
||||
|
||||
void
|
||||
rocprofiler::remove_setup(const std::string&)
|
||||
{}
|
||||
|
||||
void
|
||||
rocprofiler::remove_shutdown(const std::string&)
|
||||
{}
|
||||
|
||||
void
|
||||
rocprofiler::setup()
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "rocprofiler is setup\n");
|
||||
}
|
||||
|
||||
void
|
||||
rocprofiler::shutdown()
|
||||
{
|
||||
rocprofsys::rocprofiler::post_process();
|
||||
rocprofsys::rocprofiler::rocm_cleanup();
|
||||
ROCPROFSYS_VERBOSE_F(1, "rocprofiler is shutdown\n");
|
||||
}
|
||||
|
||||
// Returns a scope guard built from two callbacks: the first decrements
// rocprofiler_activity_count() and the second increments it.
// NOTE(review): when each callback fires (construction vs. destruction) is decided by
// scope::transient_destructor, which is not visible here -- confirm before relying on
// the sign of the counter while the guard is alive.
// The lambdas are deliberately passed as temporaries, exactly as before, rather than
// through named locals, to avoid changing how the guard captures them.
scope::transient_destructor
rocprofiler::protect_flush_activity()
{
    return scope::transient_destructor(
        // first callback
        []() { --rocprofiler_activity_count(); },
        // second callback
        []() { ++rocprofiler_activity_count(); });
}
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
|
||||
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(rocprofiler, false, void)
|
||||
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(rocprofiler_data, true,
|
||||
tim::component::rocprofiler_value)
|
||||
@@ -1,241 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/defines.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
|
||||
#include <timemory/api.hpp>
|
||||
#include <timemory/backends/hardware_counters.hpp>
|
||||
#include <timemory/components/base.hpp>
|
||||
#include <timemory/components/data_tracker/components.hpp>
|
||||
#include <timemory/components/macros.hpp>
|
||||
#include <timemory/enum.h>
|
||||
#include <timemory/macros.hpp>
|
||||
#include <timemory/macros/os.hpp>
|
||||
#include <timemory/mpl/concepts.hpp>
|
||||
#include <timemory/mpl/macros.hpp>
|
||||
#include <timemory/mpl/type_traits.hpp>
|
||||
#include <timemory/mpl/types.hpp>
|
||||
#include <timemory/utility/transient_function.hpp>
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
using rocm_metric_type = unsigned long long;
|
||||
using rocm_info_entry = ::tim::hardware_counters::info;
|
||||
using rocm_feature_value = std::variant<uint32_t, float, uint64_t, double>;
|
||||
|
||||
struct rocm_counter
|
||||
{
|
||||
std::array<rocm_metric_type, ROCPROFSYS_ROCM_MAX_COUNTERS> counters;
|
||||
};
|
||||
|
||||
struct rocm_event
|
||||
{
|
||||
using value_type = rocm_feature_value;
|
||||
|
||||
uint32_t device_id = 0;
|
||||
uint32_t thread_id = 0;
|
||||
uint32_t queue_id = 0;
|
||||
rocm_metric_type entry = 0;
|
||||
rocm_metric_type exit = 0;
|
||||
std::string name = {};
|
||||
std::vector<size_t> feature_names = {};
|
||||
std::vector<rocm_feature_value> feature_values = {};
|
||||
|
||||
rocm_event() = default;
|
||||
rocm_event(uint32_t _dev, uint32_t _thr, uint32_t _queue, std::string _event_name,
|
||||
rocm_metric_type begin, rocm_metric_type end, uint32_t _feature_count,
|
||||
void* _features);
|
||||
|
||||
std::string as_string() const;
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& _os, const rocm_event& _v)
|
||||
{
|
||||
return (_os << _v.as_string());
|
||||
}
|
||||
|
||||
friend bool operator<(const rocm_event& _lhs, const rocm_event& _rhs)
|
||||
{
|
||||
return std::tie(_lhs.device_id, _lhs.queue_id, _lhs.entry, _lhs.thread_id) <
|
||||
std::tie(_rhs.device_id, _rhs.queue_id, _rhs.entry, _rhs.thread_id);
|
||||
}
|
||||
};
|
||||
|
||||
using rocm_data_t = std::vector<rocm_event>;
|
||||
using rocm_data_tracker = data_tracker<rocm_feature_value, rocm_event>;
|
||||
|
||||
rocprofsys::unique_ptr_t<rocm_data_t>&
|
||||
rocm_data(int64_t _tid = threading::get_id());
|
||||
|
||||
using rocprofiler_value = typename rocm_event::value_type;
|
||||
using rocprofiler_data = data_tracker<rocprofiler_value, rocprofiler>;
|
||||
|
||||
struct rocprofiler
|
||||
: base<rocprofiler, void>
|
||||
, private policy::instance_tracker<rocprofiler, false>
|
||||
{
|
||||
using value_type = void;
|
||||
using base_type = base<rocprofiler, void>;
|
||||
using tracker_type = policy::instance_tracker<rocprofiler, false>;
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(rocprofiler)
|
||||
|
||||
static void preinit();
|
||||
static void global_init() { setup(); }
|
||||
static void global_finalize() { shutdown(); }
|
||||
|
||||
static bool is_setup();
|
||||
static void setup();
|
||||
static void shutdown();
|
||||
static void add_setup(const std::string&, std::function<void()>&&);
|
||||
static void add_shutdown(const std::string&, std::function<void()>&&);
|
||||
static void remove_setup(const std::string&);
|
||||
static void remove_shutdown(const std::string&);
|
||||
|
||||
void start();
|
||||
void stop();
|
||||
|
||||
// this function protects rocprofiler_flush_activty from being called
|
||||
// when rocprof-sys exits during a callback
|
||||
[[nodiscard]] static scope::transient_destructor protect_flush_activity();
|
||||
};
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
inline void
|
||||
rocprofiler::setup()
|
||||
{}
|
||||
|
||||
inline void
|
||||
rocprofiler::shutdown()
|
||||
{}
|
||||
|
||||
inline bool
|
||||
rocprofiler::is_setup()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
using ::rocprofsys::component::rocm_data_tracker;
|
||||
using ::rocprofsys::component::rocm_feature_value;
|
||||
using ::rocprofsys::component::rocprofiler_data;
|
||||
using ::rocprofsys::component::rocprofiler_value;
|
||||
} // namespace component
|
||||
} // namespace tim
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace operation
|
||||
{
|
||||
template <>
|
||||
struct set_storage<component::rocm_data_tracker>
|
||||
{
|
||||
using T = component::rocm_data_tracker;
|
||||
static constexpr size_t max_threads = 4096;
|
||||
using type = T;
|
||||
using storage_array_t = std::array<storage<type>*, max_threads>;
|
||||
friend struct get_storage<component::rocm_data_tracker>;
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(set_storage)
|
||||
|
||||
auto operator()(storage<type>*, size_t) const {}
|
||||
auto operator()(type&, size_t) const {}
|
||||
auto operator()(storage<type>* _v) const { get().fill(_v); }
|
||||
|
||||
private:
|
||||
static storage_array_t& get()
|
||||
{
|
||||
static storage_array_t _v = { nullptr };
|
||||
return _v;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct get_storage<component::rocm_data_tracker>
|
||||
{
|
||||
using type = component::rocm_data_tracker;
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(get_storage)
|
||||
|
||||
auto operator()(const type&) const
|
||||
{
|
||||
return operation::set_storage<type>::get().at(0);
|
||||
}
|
||||
|
||||
auto operator()() const
|
||||
{
|
||||
type _obj{};
|
||||
return (*this)(_obj);
|
||||
}
|
||||
|
||||
auto operator()(size_t _idx) const
|
||||
{
|
||||
return operation::set_storage<type>::get().at(_idx);
|
||||
}
|
||||
|
||||
auto operator()(type&, size_t _idx) const { return (*this)(_idx); }
|
||||
};
|
||||
} // namespace operation
|
||||
} // namespace tim
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler_data, false_type)
|
||||
#endif
|
||||
|
||||
TIMEMORY_SET_COMPONENT_API(component::rocprofiler_data, project::timemory,
|
||||
category::timing, os::supports_unix)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_timing_category, component::rocprofiler_data,
|
||||
false_type)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::rocprofiler_data,
|
||||
false_type)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_units, component::rocprofiler_data, false_type)
|
||||
TIMEMORY_STATISTICS_TYPE(component::rocprofiler_data, component::rocprofiler_value)
|
||||
TIMEMORY_STATISTICS_TYPE(component::rocm_data_tracker, component::rocm_feature_value)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_units, component::rocm_data_tracker, false_type)
|
||||
|
||||
#if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \
|
||||
(defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0)
|
||||
|
||||
# include <timemory/operations.hpp>
|
||||
|
||||
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(rocprofiler, false, void)
|
||||
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(rocprofiler_data, true, double)
|
||||
|
||||
#endif
|
||||
@@ -1,396 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/roctracer.hpp"
|
||||
#include "core/common.hpp"
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/defines.hpp"
|
||||
#include "core/dynamic_library.hpp"
|
||||
#include "core/redirect.hpp"
|
||||
#include "library/roctracer.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/thread_info.hpp"
|
||||
|
||||
#include <chrono>
|
||||
#include <roctracer.h>
|
||||
|
||||
#define HIP_PROF_HIP_API_STRING 1
|
||||
|
||||
#include <roctracer_ext.h>
|
||||
#include <roctracer_hip.h>
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION < 50300
|
||||
# include <roctracer_hcc.h>
|
||||
#endif
|
||||
|
||||
#define AMD_INTERNAL_BUILD 1
|
||||
#include <roctracer_hsa.h>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
namespace
|
||||
{
|
||||
// Returns the single shared counter read by roctracer::flush() and modified by
// roctracer::protect_flush_activity(). It is a function-local static, created on
// first use.
std::atomic<int64_t>&
roctracer_activity_count()
{
    static std::atomic<int64_t> _count{ 0 };
    return _count;
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
roctracer::preinit()
|
||||
{
|
||||
roctracer_data::label() = "roctracer";
|
||||
roctracer_data::description() = "ROCm tracer (activity API)";
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::start()
|
||||
{
|
||||
if(tracker_type::start() == 0) setup(nullptr);
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::stop()
|
||||
{
|
||||
if(tracker_type::stop() == 0) shutdown();
|
||||
}
|
||||
|
||||
bool
|
||||
roctracer::is_setup()
|
||||
{
|
||||
return roctracer_is_setup();
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::add_setup(const std::string& _lbl, std::function<void()>&& _func)
|
||||
{
|
||||
roctracer_setup_routines().emplace_back(_lbl, std::move(_func));
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::add_shutdown(const std::string& _lbl, std::function<void()>&& _func)
|
||||
{
|
||||
roctracer_shutdown_routines().emplace_back(_lbl, std::move(_func));
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::remove_setup(const std::string& _lbl)
|
||||
{
|
||||
auto& _data = roctracer_setup_routines();
|
||||
for(auto itr = _data.begin(); itr != _data.end(); ++itr)
|
||||
{
|
||||
if(itr->first == _lbl)
|
||||
{
|
||||
_data.erase(itr);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::remove_shutdown(const std::string& _lbl)
|
||||
{
|
||||
auto& _data = roctracer_setup_routines();
|
||||
for(auto itr = _data.begin(); itr != _data.end(); ++itr)
|
||||
{
|
||||
if(itr->first == _lbl)
|
||||
{
|
||||
_data.erase(itr);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::setup(void* table, bool on_load_trace)
|
||||
{
|
||||
if(!get_use_roctracer()) return;
|
||||
|
||||
auto_lock_t _lk{ type_mutex<roctracer>() };
|
||||
if(roctracer_is_setup()) return;
|
||||
roctracer_is_setup() = true;
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(1, "setting up roctracer...\n");
|
||||
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
|
||||
|
||||
dynamic_library _amdhip64{ "ROCPROFSYS_ROCTRACER_LIBAMDHIP64",
|
||||
find_library_path("libamdhip64.so",
|
||||
{ "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" },
|
||||
{ ROCPROFSYS_DEFAULT_ROCM_PATH }) };
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR < 4
|
||||
dynamic_library _kfdwrapper{
|
||||
"ROCPROFSYS_ROCTRACER_LIBKFDWRAPPER",
|
||||
find_library_path("libkfdwrapper64.so", { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" },
|
||||
{ ROCPROFSYS_DEFAULT_ROCM_PATH },
|
||||
{ "roctracer/lib", "roctracer/lib64", "lib", "lib64" })
|
||||
};
|
||||
#endif
|
||||
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr));
|
||||
|
||||
// Allocating tracing pool
|
||||
roctracer_properties_t properties{};
|
||||
memset(&properties, 0, sizeof(roctracer_properties_t));
|
||||
// properties.mode = 0x1000;
|
||||
properties.buffer_size = 0x100;
|
||||
properties.buffer_callback_fun = hip_activity_callback;
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_open_pool(&properties));
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 4
|
||||
// HIP 4.5.0 has an invalid warning
|
||||
redirect _rd{ std::cerr, "roctracer_enable_callback(), get_op_end(), invalid domain "
|
||||
"ID(4) in: roctracer_enable_callback(hip_api_callback, "
|
||||
"nullptr)roctracer_enable_activity_expl(), get_op_end(), "
|
||||
"invalid domain ID(4) in: roctracer_enable_activity()" };
|
||||
#endif
|
||||
|
||||
if(get_trace_hip_api())
|
||||
{
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback(
|
||||
ACTIVITY_DOMAIN_HIP_API, hip_api_callback, nullptr));
|
||||
}
|
||||
|
||||
if(get_use_roctx())
|
||||
{
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback(
|
||||
ACTIVITY_DOMAIN_ROCTX, roctx_api_callback, nullptr));
|
||||
}
|
||||
|
||||
if(get_trace_hip_activity())
|
||||
{
|
||||
// Enable HIP activity tracing
|
||||
ROCPROFSYS_ROCTRACER_CALL(
|
||||
roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS));
|
||||
}
|
||||
|
||||
if(table != nullptr)
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(1 || on_load_trace, "[OnLoad] setting up HSA...\n");
|
||||
|
||||
bool trace_hsa_api = get_trace_hsa_api();
|
||||
|
||||
// Enable HSA API callbacks/activity
|
||||
if(trace_hsa_api)
|
||||
{
|
||||
std::vector<std::string> hsa_api_vec =
|
||||
tim::delimit(get_trace_hsa_api_types());
|
||||
|
||||
// initialize HSA tracing
|
||||
roctracer_set_properties(
|
||||
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API), (void*) table);
|
||||
|
||||
if(!hsa_api_vec.empty())
|
||||
{
|
||||
for(const auto& itr : hsa_api_vec)
|
||||
{
|
||||
uint32_t cid = HSA_API_ID_NUMBER;
|
||||
const char* api = itr.c_str();
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_op_code(
|
||||
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API), api,
|
||||
&cid, nullptr));
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_op_callback(
|
||||
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API), cid,
|
||||
hsa_api_callback, nullptr));
|
||||
|
||||
ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-trace(%s)", api);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-trace()\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback(
|
||||
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API),
|
||||
hsa_api_callback, nullptr));
|
||||
}
|
||||
}
|
||||
|
||||
bool trace_hsa_activity = get_trace_hsa_activity();
|
||||
// Enable HSA GPU activity
|
||||
if(trace_hsa_activity)
|
||||
{
|
||||
#if ROCPROFSYS_HIP_VERSION < 50300
|
||||
using namespace roctracer;
|
||||
// initialize HSA tracing
|
||||
const char* output_prefix = nullptr;
|
||||
hsa_ops_properties_t ops_properties{
|
||||
table, reinterpret_cast<activity_async_callback_t>(hsa_activity_callback),
|
||||
nullptr, output_prefix
|
||||
};
|
||||
#elif ROCPROFSYS_HIP_VERSION < 50301
|
||||
hsa_ops_properties_t ops_properties;
|
||||
ops_properties.table = table;
|
||||
ops_properties.reserved1[0] = reinterpret_cast<void*>(&hsa_activity_callback);
|
||||
ops_properties.reserved1[1] = nullptr;
|
||||
ops_properties.reserved1[2] = nullptr;
|
||||
#else
|
||||
hsa_ops_properties_t ops_properties{
|
||||
table, reinterpret_cast<void*>(&hsa_activity_callback), nullptr, nullptr
|
||||
};
|
||||
#endif
|
||||
roctracer_set_properties(
|
||||
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_OPS), &ops_properties);
|
||||
|
||||
ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-activity-trace()\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_op_activity(
|
||||
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_OPS), HSA_OP_ID_COPY));
|
||||
}
|
||||
}
|
||||
|
||||
// callback for HSA
|
||||
for(auto& itr : roctracer_setup_routines())
|
||||
itr.second();
|
||||
|
||||
// make sure all async callbacks are allocated
|
||||
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
|
||||
hip_exec_activity_callbacks(i);
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(1, "roctracer is setup\n");
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::flush()
|
||||
{
|
||||
auto wait_for_activity_flush_completion = []() {
|
||||
uint16_t nitr = 0;
|
||||
while(roctracer_activity_count() > 0 && nitr++ < 10)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds{ 100 });
|
||||
};
|
||||
|
||||
// a flush may already be happening
|
||||
wait_for_activity_flush_completion();
|
||||
|
||||
if(roctracer_activity_count() == 0)
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(2, "executing roctracer_flush_activity()...\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_flush_activity());
|
||||
// wait to make sure flush completes
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds{ 100 });
|
||||
wait_for_activity_flush_completion();
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCPROFSYS_CI_FAIL(true,
|
||||
"roctracer_activity_count() != 0 (== %li). "
|
||||
"roctracer::shutdown() most likely called during abort",
|
||||
roctracer_activity_count().load());
|
||||
}
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(2, "executing hip_exec_activity_callbacks(0..%zu)\n",
|
||||
thread_info::get_peak_num_threads());
|
||||
// make sure all async operations are executed
|
||||
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
|
||||
hip_exec_activity_callbacks(i);
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(2, "roctracer flush completed\n");
|
||||
}
|
||||
|
||||
void
|
||||
roctracer::shutdown()
|
||||
{
|
||||
auto_lock_t _lk{ type_mutex<roctracer>() };
|
||||
if(!roctracer_is_setup()) return;
|
||||
|
||||
roctracer_is_setup() = false;
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(1, "shutting down roctracer...\n");
|
||||
|
||||
// callback for hsa
|
||||
ROCPROFSYS_VERBOSE_F(2, "executing %zu roctracer_shutdown_routines...\n",
|
||||
roctracer_shutdown_routines().size());
|
||||
for(auto& itr : roctracer_shutdown_routines())
|
||||
itr.second();
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 4
|
||||
ROCPROFSYS_DEBUG_F("redirecting roctracer warnings\n");
|
||||
// HIP 4.5.0 has an invalid warning
|
||||
redirect _rd{
|
||||
std::cerr, "roctracer_disable_callback(), get_op_end(), invalid domain ID(4) "
|
||||
"in: roctracer_disable_callback()roctracer_disable_activity(), "
|
||||
"get_op_end(), invalid domain ID(4) in: roctracer_disable_activity()"
|
||||
};
|
||||
#endif
|
||||
|
||||
if(get_trace_hip_api())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(
|
||||
2,
|
||||
"executing roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)...\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(
|
||||
roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API));
|
||||
}
|
||||
|
||||
if(get_use_roctx())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(
|
||||
2, "executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_ROCTX)...\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(
|
||||
roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX));
|
||||
}
|
||||
|
||||
if(get_trace_hip_activity())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(
|
||||
2,
|
||||
"executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS)...\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(
|
||||
roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS));
|
||||
}
|
||||
|
||||
if(get_trace_hsa_api())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(
|
||||
2,
|
||||
"executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HSA_API)...\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(
|
||||
roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_API));
|
||||
}
|
||||
|
||||
if(get_trace_hsa_api())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(
|
||||
2, "executing roctracer_disable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, "
|
||||
"HSA_OP_ID_COPY)...\n");
|
||||
ROCPROFSYS_ROCTRACER_CALL(
|
||||
roctracer_disable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY));
|
||||
}
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(1, "roctracer is shutdown\n");
|
||||
}
|
||||
|
||||
// Returns a scope guard built from two callbacks: the first decrements
// roctracer_activity_count() and the second increments it. flush() consults the same
// counter before calling roctracer_flush_activity().
// NOTE(review): when each callback fires (construction vs. destruction) is decided by
// scope::transient_destructor, which is not visible here -- confirm before relying on
// the sign of the counter while the guard is alive.
// The lambdas are deliberately passed as temporaries, exactly as before, rather than
// through named locals, to avoid changing how the guard captures them.
scope::transient_destructor
roctracer::protect_flush_activity()
{
    return scope::transient_destructor(
        // first callback
        []() { --roctracer_activity_count(); },
        // second callback
        []() { ++roctracer_activity_count(); });
}
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
|
||||
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(roctracer, false, void)
|
||||
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(roctracer_data, true, double)
|
||||
@@ -1,117 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/common.hpp"
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/defines.hpp"
|
||||
|
||||
#include <timemory/api.hpp>
|
||||
#include <timemory/components/base.hpp>
|
||||
#include <timemory/components/data_tracker/components.hpp>
|
||||
#include <timemory/components/macros.hpp>
|
||||
#include <timemory/enum.h>
|
||||
#include <timemory/macros/os.hpp>
|
||||
#include <timemory/mpl/type_traits.hpp>
|
||||
#include <timemory/mpl/types.hpp>
|
||||
#include <timemory/utility/transient_function.hpp>
|
||||
|
||||
ROCPROFSYS_COMPONENT_ALIAS(roctracer_data,
|
||||
::tim::component::data_tracker<double, roctracer>)
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
struct roctracer
|
||||
: base<roctracer, void>
|
||||
, private policy::instance_tracker<roctracer, false>
|
||||
{
|
||||
using value_type = void;
|
||||
using base_type = base<roctracer, void>;
|
||||
using tracker_type = policy::instance_tracker<roctracer, false>;
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(roctracer)
|
||||
|
||||
static void preinit();
|
||||
static void global_finalize() { shutdown(); }
|
||||
|
||||
static bool is_setup();
|
||||
static void setup(void* hsa_api_table, bool on_load_trace = false);
|
||||
static void flush();
|
||||
static void shutdown();
|
||||
static void add_setup(const std::string&, std::function<void()>&&);
|
||||
static void add_shutdown(const std::string&, std::function<void()>&&);
|
||||
static void remove_setup(const std::string&);
|
||||
static void remove_shutdown(const std::string&);
|
||||
|
||||
void start();
|
||||
void stop();
|
||||
|
||||
// this function protects roctracer_flush_activity from being called
|
||||
// when rocprof-sys exits during a callback
|
||||
[[nodiscard]] static scope::transient_destructor protect_flush_activity();
|
||||
};
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
inline void
|
||||
roctracer::setup(void*, bool)
|
||||
{}
|
||||
|
||||
inline void
|
||||
roctracer::flush()
|
||||
{}
|
||||
|
||||
inline void
|
||||
roctracer::shutdown()
|
||||
{}
|
||||
|
||||
inline bool
|
||||
roctracer::is_setup()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCTRACER)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer_data, false_type)
|
||||
#endif
|
||||
|
||||
TIMEMORY_SET_COMPONENT_API(rocprofsys::component::roctracer_data, project::timemory,
|
||||
category::timing, os::supports_unix)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_timing_category, component::roctracer_data, true_type)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::roctracer_data, true_type)
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
|
||||
# if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \
|
||||
(defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0)
|
||||
|
||||
# include <timemory/operations.hpp>
|
||||
|
||||
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(roctracer, false, void)
|
||||
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(roctracer_data, true, double)
|
||||
|
||||
# endif
|
||||
#endif
|
||||
@@ -25,12 +25,8 @@
|
||||
#include "core/debug.hpp"
|
||||
#include "core/dynamic_library.hpp"
|
||||
#include "core/gpu.hpp"
|
||||
#include "library/components/rocprofiler.hpp"
|
||||
#include "library/components/roctracer.hpp"
|
||||
#include "library/rocm/hsa_rsrc_factory.hpp"
|
||||
#include "library/rocm_smi.hpp"
|
||||
#include "library/rocprofiler.hpp"
|
||||
#include "library/roctracer.hpp"
|
||||
#include "library/rocprofiler-sdk.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/tracing.hpp"
|
||||
@@ -46,208 +42,18 @@
|
||||
#include <mutex>
|
||||
#include <tuple>
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
# include <rocprofiler.h>
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
# include <rocprofiler-sdk/rocprofiler.h>
|
||||
#endif
|
||||
|
||||
using namespace rocprofsys;
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocm
|
||||
{
|
||||
std::mutex rocm_mutex = {};
|
||||
bool is_loaded = false;
|
||||
bool on_load_trace = (get_env<int>("ROCP_ONLOAD_TRACE", 0) > 0);
|
||||
std::vector<hardware_counter_info>
|
||||
rocm_events()
|
||||
{
|
||||
return rocprofiler_sdk::get_rocm_events_info();
|
||||
}
|
||||
} // namespace rocm
|
||||
} // namespace rocprofsys
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
std::ostream&
|
||||
operator<<(std::ostream& _os, const rocprofiler_settings_t& _v)
|
||||
{
|
||||
# define ROCPROF_SETTING_FIELD_STR(NAME) JOIN('=', # NAME, _v.NAME)
|
||||
|
||||
_os << JOIN(
|
||||
", ", ROCPROF_SETTING_FIELD_STR(intercept_mode),
|
||||
ROCPROF_SETTING_FIELD_STR(code_obj_tracking),
|
||||
ROCPROF_SETTING_FIELD_STR(memcopy_tracking),
|
||||
ROCPROF_SETTING_FIELD_STR(trace_size), ROCPROF_SETTING_FIELD_STR(trace_local),
|
||||
ROCPROF_SETTING_FIELD_STR(timeout), ROCPROF_SETTING_FIELD_STR(timestamp_on),
|
||||
ROCPROF_SETTING_FIELD_STR(hsa_intercepting),
|
||||
ROCPROF_SETTING_FIELD_STR(k_concurrent), ROCPROF_SETTING_FIELD_STR(opt_mode),
|
||||
ROCPROF_SETTING_FIELD_STR(obj_dumping));
|
||||
return _os;
|
||||
}
|
||||
#endif
|
||||
|
||||
// HSA-runtime tool on-load method
|
||||
extern "C"
|
||||
{
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
void OnUnloadTool()
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading...\n");
|
||||
|
||||
rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock };
|
||||
if(!_lk.owns_lock()) _lk.lock();
|
||||
|
||||
if(!rocm::is_loaded)
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace,
|
||||
"rocprofiler is not loaded\n");
|
||||
return;
|
||||
}
|
||||
rocm::is_loaded = false;
|
||||
|
||||
_lk.unlock();
|
||||
|
||||
// stop_top_level_timer_if_necessary();
|
||||
// Final resources cleanup
|
||||
rocprofsys::rocprofiler::rocm_cleanup();
|
||||
}
|
||||
|
||||
// rocprofiler tool on-load hook.
// Does nothing unless rocprofiler usage is enabled in the configuration and at
// least one ROCm event is requested. Otherwise it marks the tool as loaded
// (once, under rocm::rocm_mutex), adjusts the rocprofiler settings handed in by
// the caller, initializes rocprofiler and prints the GPU agents.
//
// @param settings  rocprofiler settings structure to update in place
void OnLoadToolProp(rocprofiler_settings_t* settings)
{
    const bool _wanted =
        config::get_use_rocprofiler() && !config::get_rocm_events().empty();
    if(!_wanted) return;

    ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n");

    {
        // constructing the unique_lock acquires the mutex; it is released when
        // this scope ends (including on the early return below)
        rocm::lock_t _guard{ rocm::rocm_mutex };

        if(rocm::is_loaded)
        {
            ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace,
                                       "rocprofiler is already loaded\n");
            return;
        }

        rocm::is_loaded = true;
    }

    // Enable timestamping and interception; disable concurrent kernels and
    // object dumping. All remaining fields (code_obj_tracking, memcopy_tracking,
    // trace_local, opt_mode, trace_size, timeout) keep the values they came with.
    settings->timestamp_on     = 1;
    settings->intercept_mode   = 1;
    settings->hsa_intercepting = 1;
    settings->k_concurrent     = 0;
    settings->obj_dumping      = 0;

    ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "rocprofiler settings: %s\n",
                               JOIN("", *settings).c_str());

    // Initialize profiling
    rocprofsys::rocprofiler::rocm_initialize();
    ::rocprofiler::util::HsaRsrcFactory::Instance().PrintGpuAgents("ROCm");
}
|
||||
#endif
|
||||
|
||||
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
|
||||
const char* const* failed_tool_names)
|
||||
{
|
||||
tim::consume_parameters(table, runtime_version, failed_tool_count,
|
||||
failed_tool_names);
|
||||
|
||||
static bool _once = false;
|
||||
if(_once) return true;
|
||||
_once = true;
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n");
|
||||
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
|
||||
|
||||
if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true)) return true;
|
||||
if(!tim::settings::enabled()) return true;
|
||||
|
||||
roctracer_is_init() = true;
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "Loading ROCm tooling...\n");
|
||||
|
||||
if(!config::settings_are_configured() && get_state() < State::Active)
|
||||
rocprofsys_init_tooling_hidden();
|
||||
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION < 50300
|
||||
ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace,
|
||||
"Computing the roctracer clock skew...\n");
|
||||
(void) rocprofsys::get_clock_skew();
|
||||
#endif
|
||||
|
||||
if(get_use_process_sampling() && get_use_rocm_smi())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace,
|
||||
"Setting rocm_smi state to active...\n");
|
||||
rocm_smi::set_state(State::Active);
|
||||
}
|
||||
|
||||
comp::roctracer::setup(static_cast<void*>(table), rocm::on_load_trace);
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
bool _force_rocprofiler_init =
|
||||
tim::get_env("ROCPROFSYS_FORCE_ROCPROFILER_INIT", false, false);
|
||||
#else
|
||||
bool _force_rocprofiler_init = false;
|
||||
#endif
|
||||
|
||||
bool _success = true;
|
||||
bool _is_empty =
|
||||
(config::settings_are_configured() && config::get_rocm_events().empty());
|
||||
if(_force_rocprofiler_init || (get_use_rocprofiler() && !_is_empty))
|
||||
{
|
||||
#if ROCPROFSYS_HIP_VERSION < 50500
|
||||
auto _rocprof = dynamic_library{
|
||||
"ROCPROFSYS_ROCPROFILER_LIBRARY",
|
||||
find_library_path(
|
||||
"librocprofiler64.so", { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" },
|
||||
{ ROCPROFSYS_DEFAULT_ROCM_PATH },
|
||||
{ "lib", "lib64", "rocprofiler/lib", "rocprofiler/lib64" }),
|
||||
(RTLD_LAZY | RTLD_GLOBAL), false
|
||||
};
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace,
|
||||
"Loading rocprofiler library (%s=%s)...\n",
|
||||
_rocprof.envname.c_str(), _rocprof.filename.c_str());
|
||||
_rocprof.open();
|
||||
|
||||
on_load_t _rocprof_load = nullptr;
|
||||
_success = _rocprof.invoke("OnLoad", _rocprof_load, table, runtime_version,
|
||||
failed_tool_count, failed_tool_names);
|
||||
ROCPROFSYS_CONDITIONAL_PRINT_F(!_success,
|
||||
"Warning! Invoking rocprofiler's OnLoad "
|
||||
"failed! ROCPROFSYS_ROCPROFILER_LIBRARY=%s\n",
|
||||
_rocprof.filename.c_str());
|
||||
ROCPROFSYS_CI_THROW(!_success,
|
||||
"Warning! Invoking rocprofiler's OnLoad "
|
||||
"failed! ROCPROFSYS_ROCPROFILER_LIBRARY=%s\n",
|
||||
_rocprof.filename.c_str());
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
using ::rocprofiler::util::HsaRsrcFactory;
|
||||
|
||||
HsaRsrcFactory::Instance().PrintGpuAgents("ROCm");
|
||||
}
|
||||
|
||||
gpu::add_hip_device_metadata();
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading... %s\n",
|
||||
(_success) ? "Done" : "Failed");
|
||||
return _success;
|
||||
}
|
||||
|
||||
// HSA-runtime on-unload method
|
||||
void OnUnload()
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading...\n");
|
||||
rocprofsys_finalize_hidden();
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading... Done\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,36 +23,48 @@
|
||||
#pragma once
|
||||
|
||||
#include "core/defines.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
# include <rocprofiler.h>
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
# include <rocprofiler-sdk/registration.h>
|
||||
# include <rocprofiler-sdk/rocprofiler.h>
|
||||
#endif
|
||||
|
||||
#include <cstdint>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocm
|
||||
{
|
||||
using lock_t = std::unique_lock<std::mutex>;
|
||||
using hardware_counter_info = ::tim::hardware_counters::info;
|
||||
|
||||
extern std::mutex rocm_mutex;
|
||||
extern bool is_loaded;
|
||||
std::vector<hardware_counter_info>
|
||||
rocm_events();
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0
|
||||
inline std::vector<hardware_counter_info>
|
||||
rocm_events()
|
||||
{
|
||||
return std::vector<hardware_counter_info>();
|
||||
}
|
||||
#endif
|
||||
} // namespace rocm
|
||||
} // namespace rocprofsys
|
||||
|
||||
extern "C"
|
||||
{
|
||||
struct HsaApiTable;
|
||||
using on_load_t = bool (*)(HsaApiTable*, uint64_t, uint64_t, const char* const*);
|
||||
struct rocprofiler_tool_configure_result_t;
|
||||
struct rocprofiler_client_id_t;
|
||||
|
||||
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
|
||||
const char* const* failed_tool_names) ROCPROFSYS_PUBLIC_API;
|
||||
void OnUnload() ROCPROFSYS_PUBLIC_API;
|
||||
using rocprofiler_configure_t =
|
||||
rocprofiler_tool_configure_result_t* (*) (uint32_t version,
|
||||
const char* runtime_version,
|
||||
uint32_t priority,
|
||||
rocprofiler_client_id_t* client_id);
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
|
||||
void OnLoadToolProp(rocprofiler_settings_t* settings) ROCPROFSYS_PUBLIC_API;
|
||||
void OnUnloadTool() ROCPROFSYS_PUBLIC_API;
|
||||
#endif
|
||||
rocprofiler_tool_configure_result_t* rocprofiler_configure(
|
||||
uint32_t version, const char* runtime_version, uint32_t priority,
|
||||
rocprofiler_client_id_t* client_id) ROCPROFSYS_PUBLIC_API;
|
||||
}
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
#
|
||||
# Add the HSA resource factory sources to the object library only when at
# least one of the roctracer / rocprofiler options is enabled.
if(ROCPROFSYS_USE_ROCTRACER OR ROCPROFSYS_USE_ROCPROFILER)
    target_sources(
        rocprofiler-systems-object-library
        PRIVATE
            ${CMAKE_CURRENT_LIST_DIR}/hsa_rsrc_factory.hpp
            ${CMAKE_CURRENT_LIST_DIR}/hsa_rsrc_factory.cpp
    )
endif()
|
||||
ファイル差分が大きすぎるため省略します
差分を読み込み
@@ -1,582 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/exception.hpp"
|
||||
|
||||
#define AMD_INTERNAL_BUILD 1
|
||||
|
||||
#include <hsa.h>
|
||||
#include <hsa_api_trace.h>
|
||||
#include <hsa_ext_amd.h>
|
||||
#include <hsa_ext_finalize.h>
|
||||
#include <hsa_ven_amd_aqlprofile.h>
|
||||
#include <hsa_ven_amd_loader.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define HSA_ARGUMENT_ALIGN_BYTES 16
|
||||
#define HSA_QUEUE_ALIGN_BYTES 64
|
||||
#define HSA_PACKET_ALIGN_BYTES 64
|
||||
#define HSA_MESSAGE_LENGTH 4096
|
||||
|
||||
// Throws ::rocprofsys::exception<std::runtime_error> with the text
// "<msg>: <HSA status string>" when `status` is not HSA_STATUS_SUCCESS.
// If hsa_status_string() does not provide a description, "<unknown error>" is
// used instead.
//
// `status` is evaluated exactly once and cached in a local, so an expression
// with side effects (e.g. a direct HSA call) can be passed without it being
// executed a second time on the error path.
#define CHECK_STATUS(msg, status)                                                        \
    do                                                                                   \
    {                                                                                    \
        const auto _rocprofsys_hsa_status = (status);                                    \
        if(_rocprofsys_hsa_status != HSA_STATUS_SUCCESS)                                 \
        {                                                                                \
            const char* emsg = nullptr;                                                  \
            hsa_status_string(_rocprofsys_hsa_status, &emsg);                            \
            char _buffer[HSA_MESSAGE_LENGTH];                                            \
            snprintf(_buffer, HSA_MESSAGE_LENGTH - 1, "%s: %s", (msg),                   \
                     emsg ? emsg : "<unknown error>");                                   \
            throw ::rocprofsys::exception<std::runtime_error>(_buffer);                  \
        }                                                                                \
    } while(0)
|
||||
|
||||
// Variant of CHECK_STATUS for HSA iteration calls: throws
// ::rocprofsys::exception<std::runtime_error> with the text
// "<msg>: <HSA status string>" when `status` is not HSA_STATUS_INFO_BREAK.
// If hsa_status_string() does not provide a description, "<unknown error>" is
// used instead.
//
// `status` is evaluated exactly once and cached in a local, so an expression
// with side effects (e.g. a direct HSA call) can be passed without it being
// executed a second time on the error path.
#define CHECK_ITER_STATUS(msg, status)                                                   \
    do                                                                                   \
    {                                                                                    \
        const auto _rocprofsys_hsa_status = (status);                                    \
        if(_rocprofsys_hsa_status != HSA_STATUS_INFO_BREAK)                              \
        {                                                                                \
            const char* emsg = nullptr;                                                  \
            hsa_status_string(_rocprofsys_hsa_status, &emsg);                            \
            char _buffer[HSA_MESSAGE_LENGTH];                                            \
            snprintf(_buffer, HSA_MESSAGE_LENGTH - 1, "%s: %s", (msg),                   \
                     emsg ? emsg : "<unknown error>");                                   \
            throw ::rocprofsys::exception<std::runtime_error>(_buffer);                  \
        }                                                                                \
    } while(0)
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace util
|
||||
{
|
||||
static const size_t MEM_PAGE_BYTES = 0x1000;
|
||||
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
|
||||
typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
|
||||
|
||||
struct hsa_pfn_t
|
||||
{
|
||||
decltype(::hsa_init)* hsa_init;
|
||||
decltype(::hsa_shut_down)* hsa_shut_down;
|
||||
decltype(::hsa_agent_get_info)* hsa_agent_get_info;
|
||||
decltype(::hsa_iterate_agents)* hsa_iterate_agents;
|
||||
|
||||
decltype(::hsa_queue_create)* hsa_queue_create;
|
||||
decltype(::hsa_queue_destroy)* hsa_queue_destroy;
|
||||
decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
|
||||
decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
|
||||
decltype(
|
||||
::hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl;
|
||||
|
||||
decltype(::hsa_signal_create)* hsa_signal_create;
|
||||
decltype(::hsa_signal_destroy)* hsa_signal_destroy;
|
||||
decltype(::hsa_signal_load_relaxed)* hsa_signal_load_relaxed;
|
||||
decltype(::hsa_signal_store_relaxed)* hsa_signal_store_relaxed;
|
||||
decltype(::hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire;
|
||||
decltype(::hsa_signal_store_screlease)* hsa_signal_store_screlease;
|
||||
|
||||
decltype(::hsa_code_object_reader_create_from_file)*
|
||||
hsa_code_object_reader_create_from_file;
|
||||
decltype(::hsa_executable_create_alt)* hsa_executable_create_alt;
|
||||
decltype(
|
||||
::hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
|
||||
decltype(::hsa_executable_freeze)* hsa_executable_freeze;
|
||||
decltype(::hsa_executable_destroy)* hsa_executable_destroy;
|
||||
decltype(::hsa_executable_get_symbol)* hsa_executable_get_symbol;
|
||||
decltype(::hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info;
|
||||
decltype(::hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols;
|
||||
|
||||
decltype(::hsa_system_get_info)* hsa_system_get_info;
|
||||
decltype(
|
||||
::hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table;
|
||||
|
||||
decltype(::hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools;
|
||||
decltype(::hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info;
|
||||
decltype(::hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate;
|
||||
decltype(::hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access;
|
||||
decltype(::hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy;
|
||||
|
||||
decltype(::hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler;
|
||||
decltype(
|
||||
::hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled;
|
||||
decltype(
|
||||
::hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time;
|
||||
decltype(::hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time;
|
||||
};
|
||||
|
||||
// Encapsulates information about a Hsa Agent such as its
|
||||
// handle, name, max queue size, max wavefront size, etc.
|
||||
struct AgentInfo
|
||||
{
|
||||
// Handle of Agent
|
||||
hsa_agent_t dev_id;
|
||||
|
||||
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
|
||||
uint32_t dev_type;
|
||||
|
||||
// APU flag
|
||||
bool is_apu;
|
||||
|
||||
// Agent system index
|
||||
uint32_t dev_index;
|
||||
|
||||
// GFXIP name
|
||||
char gfxip[64];
|
||||
|
||||
// Name of Agent whose length is less than 64
|
||||
char name[64];
|
||||
|
||||
// Max size of Wavefront size
|
||||
uint32_t max_wave_size;
|
||||
|
||||
// Max size of Queue buffer
|
||||
uint32_t max_queue_size;
|
||||
|
||||
// Hsail profile supported by agent
|
||||
hsa_profile_t profile;
|
||||
|
||||
// CPU/GPU/kern-arg memory pools
|
||||
hsa_amd_memory_pool_t cpu_pool;
|
||||
hsa_amd_memory_pool_t gpu_pool;
|
||||
hsa_amd_memory_pool_t kern_arg_pool;
|
||||
|
||||
// The number of compute unit available in the agent.
|
||||
uint32_t cu_num;
|
||||
|
||||
// Maximum number of waves possible in a Compute Unit.
|
||||
uint32_t waves_per_cu;
|
||||
|
||||
// Number of SIMD's per compute unit CU
|
||||
uint32_t simds_per_cu;
|
||||
|
||||
// Number of Shader Engines (SE) in Gpu
|
||||
uint32_t se_num;
|
||||
|
||||
// Number of Shader Arrays Per Shader Engines in Gpu
|
||||
uint32_t shader_arrays_per_se;
|
||||
|
||||
// SGPR/VGPR/LDS block sizes
|
||||
uint32_t sgpr_block_dflt;
|
||||
uint32_t sgpr_block_size;
|
||||
uint32_t vgpr_block_size;
|
||||
static const uint32_t lds_block_size = 128 * 4;
|
||||
};
|
||||
|
||||
// HSA timer class
|
||||
// Provides the current HSA timestamp and a system-clock/ns conversion API
|
||||
class HsaTimer
|
||||
{
|
||||
public:
|
||||
typedef uint64_t timestamp_t;
|
||||
static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
|
||||
typedef long double freq_t;
|
||||
|
||||
enum time_id_t
|
||||
{
|
||||
TIME_ID_CLOCK_REALTIME = 0,
|
||||
TIME_ID_CLOCK_REALTIME_COARSE = 1,
|
||||
TIME_ID_CLOCK_MONOTONIC = 2,
|
||||
TIME_ID_CLOCK_MONOTONIC_COARSE = 3,
|
||||
TIME_ID_CLOCK_MONOTONIC_RAW = 4,
|
||||
TIME_ID_NUMBER
|
||||
};
|
||||
|
||||
HsaTimer(const hsa_pfn_t* hsa_api)
|
||||
: hsa_api_(hsa_api)
|
||||
{
|
||||
timestamp_t sysclock_hz = 0;
|
||||
hsa_status_t status = hsa_api_->hsa_system_get_info(
|
||||
HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
|
||||
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status);
|
||||
sysclock_factor_ = (freq_t) 1000000000 / (freq_t) sysclock_hz;
|
||||
}
|
||||
|
||||
// Methods for system-clock/ns conversion
|
||||
timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const
|
||||
{
|
||||
return timestamp_t((freq_t) sysclock * sysclock_factor_);
|
||||
}
|
||||
timestamp_t ns_to_sysclock(const timestamp_t& time) const
|
||||
{
|
||||
return timestamp_t((freq_t) time / sysclock_factor_);
|
||||
}
|
||||
|
||||
// Method for timespec/ns conversion
|
||||
static timestamp_t timespec_to_ns(const timespec& time)
|
||||
{
|
||||
return ((timestamp_t) time.tv_sec * 1000000000) + time.tv_nsec;
|
||||
}
|
||||
|
||||
// Return timestamp in 'ns'
|
||||
timestamp_t timestamp_ns() const
|
||||
{
|
||||
timestamp_t sysclock;
|
||||
hsa_status_t status =
|
||||
hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock);
|
||||
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status);
|
||||
return sysclock_to_ns(sysclock);
|
||||
}
|
||||
|
||||
// Return time in 'ns'
|
||||
timestamp_t clocktime_ns(clockid_t clock_id) const
|
||||
{
|
||||
timespec time;
|
||||
clock_gettime(clock_id, &time);
|
||||
return timespec_to_ns(time);
|
||||
}
|
||||
|
||||
// Return pair of correlated values of profiling timestamp and time with
|
||||
// correlation error for a given time ID and number of iterations
|
||||
void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v,
|
||||
timestamp_t* time_v, timestamp_t* error_v)
|
||||
{
|
||||
clockid_t clock_id = 0;
|
||||
switch(time_id)
|
||||
{
|
||||
case TIME_ID_CLOCK_REALTIME: clock_id = CLOCK_REALTIME; break;
|
||||
case TIME_ID_CLOCK_REALTIME_COARSE: clock_id = CLOCK_REALTIME_COARSE; break;
|
||||
case TIME_ID_CLOCK_MONOTONIC: clock_id = CLOCK_MONOTONIC; break;
|
||||
case TIME_ID_CLOCK_MONOTONIC_COARSE: clock_id = CLOCK_MONOTONIC_COARSE; break;
|
||||
case TIME_ID_CLOCK_MONOTONIC_RAW: clock_id = CLOCK_MONOTONIC_RAW; break;
|
||||
default: CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR);
|
||||
}
|
||||
|
||||
std::vector<timestamp_t> ts_vec(iters);
|
||||
std::vector<timespec> tm_vec(iters);
|
||||
const uint32_t steps = iters - 1;
|
||||
|
||||
for(uint32_t i = 0; i < iters; ++i)
|
||||
{
|
||||
hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]);
|
||||
clock_gettime(clock_id, &tm_vec[i]);
|
||||
}
|
||||
|
||||
const timestamp_t ts_base = sysclock_to_ns(ts_vec.front());
|
||||
const timestamp_t tm_base = timespec_to_ns(tm_vec.front());
|
||||
const timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps);
|
||||
|
||||
timestamp_t ts_accum = 0;
|
||||
timestamp_t tm_accum = 0;
|
||||
for(uint32_t i = 0; i < iters; ++i)
|
||||
{
|
||||
ts_accum += (ts_vec[i] - ts_base);
|
||||
tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base);
|
||||
}
|
||||
|
||||
*timestamp_v = (ts_accum / iters) + ts_base + error;
|
||||
*time_v = (tm_accum / iters) + tm_base;
|
||||
*error_v = error;
|
||||
}
|
||||
|
||||
private:
|
||||
// Timestamp frequency factor
|
||||
freq_t sysclock_factor_;
|
||||
// HSA API table
|
||||
const hsa_pfn_t* const hsa_api_;
|
||||
};
|
||||
|
||||
class HsaRsrcFactory
|
||||
{
|
||||
public:
|
||||
static const size_t CMD_SLOT_SIZE_B = 0x40;
|
||||
typedef std::recursive_mutex mutex_t;
|
||||
typedef HsaTimer::timestamp_t timestamp_t;
|
||||
|
||||
// Returns the singleton factory, constructing it on first use.
// The whole lookup-or-create sequence runs under mutex_; `initialize_hsa` is
// only forwarded to the constructor when a new instance is actually created.
static HsaRsrcFactory* Create(bool initialize_hsa = true)
{
    std::lock_guard<mutex_t> _guard(mutex_);

    // fast exit when an instance has already been published
    if(HsaRsrcFactory* _existing = instance_.load(std::memory_order_relaxed))
        return _existing;

    HsaRsrcFactory* _created = new HsaRsrcFactory(initialize_hsa);
    instance_.store(_created, std::memory_order_release);
    return _created;
}
|
||||
|
||||
// Returns a reference to the singleton factory. If none has been published
// yet, one is created through Create(false). Throws (via CHECK_STATUS) if no
// instance could be obtained.
static HsaRsrcFactory& Instance()
{
    HsaRsrcFactory* _factory = instance_.load(std::memory_order_acquire);
    if(!_factory) _factory = Create(false);

    const hsa_status_t _result = _factory ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
    CHECK_STATUS("HsaRsrcFactory::Instance() failed", _result);
    return *_factory;
}
|
||||
|
||||
static void Destroy()
|
||||
{
|
||||
std::lock_guard<mutex_t> lck(mutex_);
|
||||
if(instance_) delete instance_.load();
|
||||
instance_ = nullptr;
|
||||
}
|
||||
|
||||
// Return system agent info
|
||||
const AgentInfo* GetAgentInfo(const hsa_agent_t agent);
|
||||
|
||||
// Get the count of Hsa Gpu Agents available on the platform
|
||||
// @return uint32_t Number of Gpu agents on platform
|
||||
uint32_t GetCountOfGpuAgents();
|
||||
|
||||
// Get the count of Hsa Cpu Agents available on the platform
|
||||
// @return uint32_t Number of Cpu agents on platform
|
||||
uint32_t GetCountOfCpuAgents();
|
||||
|
||||
// Get the AgentInfo handle of a Gpu device
|
||||
// @param idx Gpu Agent at specified index
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
// @return bool true if successful, false otherwise
|
||||
bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
|
||||
|
||||
// Get the AgentInfo handle of a Cpu device
|
||||
// @param idx Cpu Agent at specified index
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
// @return bool true if successful, false otherwise
|
||||
bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
|
||||
|
||||
// Create a Queue object and return its handle. The queue object is expected
|
||||
// to support user requested number of Aql dispatch packets.
|
||||
// @param agent_info Gpu Agent on which to create a queue object
|
||||
// @param num_pkts Number of packets to be held by queue
|
||||
// @param queue Output parameter updated with handle of queue object
|
||||
// @return bool true if successful, false otherwise
|
||||
bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
|
||||
|
||||
// Create a Signal object and return its handle.
|
||||
// @param value Initial value of signal object
|
||||
// @param signal Output parameter updated with handle of signal object
|
||||
// @return bool true if successful, false otherwise
|
||||
bool CreateSignal(uint32_t value, hsa_signal_t* signal);
|
||||
|
||||
// Allocate local GPU memory
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Allocate memory to pass kernel parameters
|
||||
// Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo
|
||||
// parameter.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Allocate system memory accessible from both CPU and GPU
|
||||
// Memory is allocated accessible to all CPU agents and AgentInfo parameter is ignored.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Allocate memory for command buffer.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Wait signal
|
||||
hsa_signal_value_t SignalWait(const hsa_signal_t& signal,
|
||||
const hsa_signal_value_t& signal_value) const;
|
||||
|
||||
// Wait signal with signal value restore
|
||||
void SignalWaitRestore(const hsa_signal_t& signal,
|
||||
const hsa_signal_value_t& signal_value) const;
|
||||
|
||||
// Copy data from GPU to host memory
|
||||
bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size);
|
||||
bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size);
|
||||
|
||||
// Memory free method
|
||||
static bool FreeMemory(void* ptr);
|
||||
|
||||
// Loads an Assembled Brig file and Finalizes it into Device Isa
|
||||
// @param agent_info Gpu device for which to finalize
|
||||
// @param brig_path File path of the Assembled Brig file
|
||||
// @param kernel_name Name of the kernel to finalize
|
||||
// @param code_desc Handle of finalized Code Descriptor that could
|
||||
// be used to submit for execution
|
||||
// @return true if successful, false otherwise
|
||||
bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
|
||||
const char* kernel_name, hsa_executable_t* hsa_exec,
|
||||
hsa_executable_symbol_t* code_desc);
|
||||
|
||||
// Print the various fields of Hsa Gpu Agents
|
||||
bool PrintGpuAgents(const std::string& header);
|
||||
|
||||
// Utils for submitting AQL packet to a given queue
|
||||
static void* GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx);
|
||||
static void* GetReadPointer(hsa_queue_t* queue);
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
|
||||
|
||||
// Enable executables loading tracking
|
||||
static bool IsExecutableTracking() { return executable_tracking_on_; }
|
||||
static void EnableExecutableTracking(HsaApiTable* table);
|
||||
static const char* GetKernelNameRef(uint64_t addr);
|
||||
|
||||
// Initialize HSA API table
|
||||
void static InitHsaApiTable(HsaApiTable* table);
|
||||
static const hsa_pfn_t* HsaApi() { return &hsa_api_; }
|
||||
|
||||
// Return AqlProfile API table
|
||||
typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t;
|
||||
const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
|
||||
|
||||
// Return Loader API table
|
||||
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
|
||||
|
||||
// Methods for system-clock/ns conversion and timestamp in 'ns'
|
||||
timestamp_t SysclockToNs(const timestamp_t& sysclock) const
|
||||
{
|
||||
return timer_->sysclock_to_ns(sysclock);
|
||||
}
|
||||
timestamp_t NsToSysclock(const timestamp_t& time) const
|
||||
{
|
||||
return timer_->ns_to_sysclock(time);
|
||||
}
|
||||
timestamp_t TimestampNs() const { return timer_->timestamp_ns(); }
|
||||
|
||||
timestamp_t GetSysTimeout() const { return timeout_; }
|
||||
static timestamp_t GetTimeoutNs() { return timeout_ns_; }
|
||||
static void SetTimeoutNs(const timestamp_t& time)
|
||||
{
|
||||
std::lock_guard<mutex_t> lck(mutex_);
|
||||
timeout_ns_ = time;
|
||||
if(instance_ != nullptr)
|
||||
Instance().timeout_ = Instance().timer_->ns_to_sysclock(time);
|
||||
}
|
||||
|
||||
// Correlate the timer's timestamp domain with the clock selected by time_id.
// Requests a correlated (timestamp, time) pair plus an error value from the
// timer, then records the shift (time - timestamp) and the error for time_id.
// @param time_id Clock domain; also the index into time_shift_/time_error_
// @param iters Passed through unchanged to HsaTimer::correlated_pair_ns
void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters)
{
  timestamp_t timestamp_v = 0;
  timestamp_t time_v = 0;
  timestamp_t error_v = 0;

  // All three locals are output arguments filled in by the timer
  timer_->correlated_pair_ns(time_id, iters, &timestamp_v, &time_v, &error_v);

  time_shift_[time_id] = time_v - timestamp_v;
  time_error_[time_id] = error_v;
}
|
||||
|
||||
// Convert a timestamp into the time domain selected by time_id by adding the
// shift recorded for that domain.
// @param time_id Index of the time domain
// @param time_stamp Timestamp to convert
// @param time_value Receives the shifted value; left untouched on error
// @return HSA_STATUS_ERROR if time_id is out of range, HSA_STATUS_SUCCESS otherwise
hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value)
{
  const bool in_range = time_id < HsaTimer::TIME_ID_NUMBER;
  if(in_range) *time_value = time_stamp + time_shift_[time_id];
  return in_range ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
|
||||
|
||||
// Report the error value recorded for the time domain selected by time_id.
// @param time_id Index of the time domain
// @param err Receives the stored error value; left untouched on error
// @return HSA_STATUS_ERROR if time_id is out of range, HSA_STATUS_SUCCESS otherwise
hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err)
{
  // Same range guard as GetTimeVal: time_error_ holds TIME_ID_NUMBER entries,
  // so an unchecked index would read past the end of the array.
  if(time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;

  *err = time_error_[time_id];
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
private:
|
||||
// System agents iterating callback
|
||||
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
|
||||
|
||||
// Callback function to find and bind kernarg region of an agent
|
||||
static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
|
||||
|
||||
// Load AQL profile HSA extension library directly
|
||||
static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);
|
||||
|
||||
// Constructor of the class. Will initialize the Hsa Runtime and
|
||||
// query the system topology to get the list of Cpu and Gpu devices
|
||||
explicit HsaRsrcFactory(bool initialize_hsa);
|
||||
|
||||
// Destructor of the class
|
||||
~HsaRsrcFactory();
|
||||
|
||||
// Add an instance of AgentInfo representing a Hsa Gpu agent
|
||||
const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
|
||||
|
||||
// To mmap command buffer memory
|
||||
static const bool CMD_MEMORY_MMAP = false;
|
||||
|
||||
// HSA was initialized
|
||||
const bool initialize_hsa_;
|
||||
|
||||
static std::atomic<HsaRsrcFactory*> instance_;
|
||||
static mutex_t mutex_;
|
||||
|
||||
// Used to maintain a list of Hsa Gpu Agent Info
|
||||
std::vector<const AgentInfo*> gpu_list_;
|
||||
std::vector<hsa_agent_t> gpu_agents_;
|
||||
|
||||
// Used to maintain a list of Hsa Cpu Agent Info
|
||||
std::vector<const AgentInfo*> cpu_list_;
|
||||
std::vector<hsa_agent_t> cpu_agents_;
|
||||
|
||||
// System agents map
|
||||
std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
|
||||
|
||||
// Executables loading tracking
|
||||
typedef std::map<uint64_t, const char*> symbols_map_t;
|
||||
static symbols_map_t* symbols_map_;
|
||||
static bool executable_tracking_on_;
|
||||
static void* to_dump_code_obj_;
|
||||
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable,
|
||||
const char* options);
|
||||
static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable);
|
||||
static hsa_status_t executable_symbols_cb(hsa_executable_t exec,
|
||||
hsa_executable_symbol_t symbol, void* data);
|
||||
|
||||
// HSA runtime API table
|
||||
static hsa_pfn_t hsa_api_;
|
||||
|
||||
// AqlProfile API table
|
||||
aqlprofile_pfn_t aqlprofile_api_;
|
||||
|
||||
// Loader API table
|
||||
hsa_ven_amd_loader_1_00_pfn_t loader_api_;
|
||||
|
||||
// System timeout, ns
|
||||
static timestamp_t timeout_ns_;
|
||||
// System timeout, sysclock
|
||||
timestamp_t timeout_;
|
||||
|
||||
// HSA timer
|
||||
HsaTimer* timer_;
|
||||
|
||||
// Time shift array to support time conversion
|
||||
timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER];
|
||||
timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER];
|
||||
|
||||
// CPU/kern-arg memory pools
|
||||
hsa_amd_memory_pool_t* cpu_pool_;
|
||||
hsa_amd_memory_pool_t* kern_arg_pool_;
|
||||
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace rocprofiler
|
||||
@@ -128,7 +128,8 @@ private:
|
||||
static bool shutdown();
|
||||
};
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
|
||||
#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0
|
||||
|
||||
inline void
|
||||
setup()
|
||||
{}
|
||||
@@ -154,7 +155,7 @@ inline void set_state(State) {}
|
||||
} // namespace rocm_smi
|
||||
} // namespace rocprofsys
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0
|
||||
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
|
||||
# if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \
|
||||
(defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0)
|
||||
|
||||
|
||||
ファイル差分が大きすぎるため省略します
差分を読み込み
+17
-45
@@ -1,6 +1,6 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -22,67 +22,39 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/defines.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "library/components/rocprofiler.hpp"
|
||||
|
||||
#include <timemory/backends/hardware_counters.hpp>
|
||||
#include <timemory/macros.hpp>
|
||||
#include <timemory/mpl/concepts.hpp>
|
||||
#include <timemory/mpl/macros.hpp>
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cstring>
|
||||
#include <dlfcn.h>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <tuple>
|
||||
#include <unistd.h>
|
||||
#include <utility>
|
||||
#include <variant>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
std::map<uint32_t, std::vector<std::string_view>>
|
||||
get_data_labels();
|
||||
using hardware_counter_info = ::tim::hardware_counters::info;
|
||||
|
||||
void
|
||||
rocm_initialize();
|
||||
setup();
|
||||
|
||||
void
|
||||
rocm_cleanup();
|
||||
shutdown();
|
||||
|
||||
bool&
|
||||
is_setup();
|
||||
void
|
||||
config();
|
||||
|
||||
void
|
||||
post_process();
|
||||
|
||||
std::vector<component::rocm_info_entry>
|
||||
rocm_metrics();
|
||||
void
|
||||
sample();
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCPROFILER) || ROCPROFSYS_USE_ROCPROFILER == 0
|
||||
inline void
|
||||
post_process()
|
||||
{}
|
||||
void
|
||||
start();
|
||||
|
||||
inline void
|
||||
rocm_cleanup()
|
||||
{}
|
||||
void
|
||||
stop();
|
||||
|
||||
inline std::vector<component::rocm_info_entry>
|
||||
rocm_metrics()
|
||||
{
|
||||
return std::vector<component::rocm_info_entry>{};
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace rocprofiler
|
||||
std::vector<hardware_counter_info>
|
||||
get_rocm_events_info();
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
+9
@@ -0,0 +1,9 @@
|
||||
#
|
||||
# Translation units of the rocprofiler-sdk integration, relative to this list file.
set(rocprofiler_sdk_sources
    ${CMAKE_CURRENT_LIST_DIR}/counters.cpp
    ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp)

# Matching headers.
set(rocprofiler_sdk_headers
    ${CMAKE_CURRENT_LIST_DIR}/counters.hpp
    ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp)

# Attach both lists to the object library as private sources.
target_sources(
    rocprofiler-systems-object-library
    PRIVATE ${rocprofiler_sdk_sources}
            ${rocprofiler_sdk_headers})
|
||||
+135
@@ -0,0 +1,135 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/rocprofiler-sdk/counters.hpp"
|
||||
#include "common/synchronized.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "library/rocprofiler-sdk/fwd.hpp"
|
||||
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/buffer_tracing.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
#include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
#include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
#include <rocprofiler-sdk/dispatch_counting_service.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
namespace
|
||||
{
|
||||
std::string
|
||||
get_counter_description(const client_data* tool_data, std::string_view _v)
|
||||
{
|
||||
const auto& _info = tool_data->events_info;
|
||||
for(const auto& itr : _info)
|
||||
{
|
||||
if(itr.symbol().find(_v) == 0 || itr.short_description().find(_v) == 0)
|
||||
{
|
||||
return itr.long_description();
|
||||
}
|
||||
}
|
||||
return std::string{};
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
counter_event::operator()(const client_data* tool_data, ::perfetto::CounterTrack* _track,
|
||||
timing_interval _timing, scope::config _scope) const
|
||||
{
|
||||
if(!record.dispatch_data) return;
|
||||
|
||||
const auto& _dispatch_info = record.dispatch_data->dispatch_info;
|
||||
const auto* _kern_sym_data =
|
||||
tool_data->get_kernel_symbol_info(_dispatch_info.kernel_id);
|
||||
|
||||
auto _bundle = counter_bundle_t{ tim::demangle(_kern_sym_data->kernel_name), _scope };
|
||||
|
||||
_bundle.push(_dispatch_info.queue_id.handle)
|
||||
.start()
|
||||
.store(record.record_counter.counter_value);
|
||||
|
||||
_bundle.stop().pop(_dispatch_info.queue_id.handle);
|
||||
|
||||
if(_track && _timing.start > 0 && _timing.end > _timing.start)
|
||||
{
|
||||
TRACE_COUNTER(trait::name<category::rocm_counter_collection>::value, *_track,
|
||||
_timing.start, record.record_counter.counter_value);
|
||||
TRACE_COUNTER(trait::name<category::rocm_counter_collection>::value, *_track,
|
||||
_timing.end, 0);
|
||||
}
|
||||
}
|
||||
|
||||
counter_storage::counter_storage(const client_data* _tool_data, uint64_t _devid,
|
||||
size_t _idx, std::string_view _name)
|
||||
: tool_data{ _tool_data }
|
||||
, device_id{ _devid }
|
||||
, index{ static_cast<int64_t>(_idx) }
|
||||
, metric_name{ _name }
|
||||
, metric_description{ get_counter_description(_tool_data, metric_name) }
|
||||
{
|
||||
auto _metric_name = std::string{ _name };
|
||||
_metric_name =
|
||||
std::regex_replace(_metric_name, std::regex{ "(.*)\\[([0-9]+)\\]" }, "$1_$2");
|
||||
storage_name = JOIN('-', "rocprof", "device", device_id, _metric_name);
|
||||
storage = std::make_unique<counter_storage_type>(tim::standalone_storage{}, index,
|
||||
storage_name);
|
||||
{
|
||||
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_COUNT;
|
||||
track_name = JOIN(" ", "GPU", _metric_name, JOIN("", '[', device_id, ']'));
|
||||
track = std::make_unique<counter_track_type>(
|
||||
::perfetto::StaticString(track_name.c_str()));
|
||||
track->set_is_incremental(false);
|
||||
track->set_unit(_unit);
|
||||
track->set_unit_multiplier(1);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
counter_storage::operator()(const counter_event& _event, timing_interval _timing,
|
||||
scope::config _scope) const
|
||||
{
|
||||
operation::set_storage<counter_data_tracker>{}(storage.get());
|
||||
_event(tool_data, track.get(), _timing, _scope);
|
||||
}
|
||||
|
||||
void
|
||||
counter_storage::write() const
|
||||
{
|
||||
operation::set_storage<counter_data_tracker>{}(storage.get());
|
||||
counter_data_tracker::label() = metric_name;
|
||||
counter_data_tracker::description() = metric_description;
|
||||
storage->write();
|
||||
}
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/synchronized.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/perfetto.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "library/rocprofiler-sdk/fwd.hpp"
|
||||
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/buffer_tracing.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
#include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
#include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
#include <rocprofiler-sdk/dispatch_counting_service.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
struct counter_dispatch_record
|
||||
{
|
||||
const rocprofiler_dispatch_counting_service_data_t* dispatch_data = nullptr;
|
||||
rocprofiler_dispatch_id_t dispatch_id = 0;
|
||||
rocprofiler_counter_id_t counter_id = {};
|
||||
rocprofiler_record_counter_t record_counter = {};
|
||||
};
|
||||
|
||||
struct counter_data_tag
|
||||
{};
|
||||
|
||||
using counter_data_tracker = component::data_tracker<double, counter_data_tag>;
|
||||
using counter_storage_type = typename counter_data_tracker::storage_type;
|
||||
using counter_bundle_t = tim::lightweight_tuple<counter_data_tracker>;
|
||||
using counter_track_type = ::perfetto::CounterTrack;
|
||||
|
||||
struct counter_event
|
||||
{
|
||||
ROCPROFSYS_DEFAULT_OBJECT(counter_event)
|
||||
|
||||
explicit counter_event(counter_dispatch_record&& _v)
|
||||
: record{ _v }
|
||||
{}
|
||||
|
||||
void operator()(const client_data* tool_data, counter_track_type*,
|
||||
timing_interval _timing, scope::config _scope) const;
|
||||
|
||||
counter_dispatch_record record = {};
|
||||
};
|
||||
|
||||
struct counter_storage
|
||||
{
|
||||
const client_data* tool_data = nullptr;
|
||||
uint64_t device_id = 0;
|
||||
int64_t index = 0;
|
||||
std::string metric_name = {};
|
||||
std::string metric_description = {};
|
||||
std::string storage_name = {};
|
||||
std::string track_name = {};
|
||||
std::unique_ptr<counter_storage_type> storage = {};
|
||||
std::unique_ptr<counter_track_type> track = {};
|
||||
|
||||
counter_storage(const client_data* _tool_data, uint64_t _devid, size_t _idx,
|
||||
std::string_view _name);
|
||||
|
||||
~counter_storage() = default;
|
||||
counter_storage(const counter_storage&) = delete;
|
||||
counter_storage(counter_storage&&) = default;
|
||||
counter_storage& operator=(const counter_storage&) = delete;
|
||||
counter_storage& operator=(counter_storage&&) = default;
|
||||
|
||||
friend bool operator<(const counter_storage& lhs, const counter_storage& rhs)
|
||||
{
|
||||
return std::tie(lhs.storage_name, lhs.device_id, lhs.index) <
|
||||
std::tie(rhs.storage_name, rhs.device_id, rhs.index);
|
||||
}
|
||||
|
||||
void operator()(const counter_event& _event, timing_interval _timing,
|
||||
scope::config _scope = scope::get_default()) const;
|
||||
|
||||
void write() const;
|
||||
};
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace operation
|
||||
{
|
||||
template <>
|
||||
struct set_storage<::rocprofsys::rocprofiler_sdk::counter_data_tracker>
|
||||
{
|
||||
static constexpr size_t max_threads = 4096;
|
||||
using type = ::rocprofsys::rocprofiler_sdk::counter_data_tracker;
|
||||
using storage_array_t = std::array<storage<type>*, max_threads>;
|
||||
friend struct get_storage<rocprofsys::rocprofiler_sdk::counter_data_tracker>;
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(set_storage)
|
||||
|
||||
auto operator()(storage<type>* _v, size_t _idx) const { get().at(_idx) = _v; }
|
||||
auto operator()(type&, size_t) const {}
|
||||
auto operator()(storage<type>* _v) const { get().fill(_v); }
|
||||
|
||||
private:
|
||||
static storage_array_t& get()
|
||||
{
|
||||
static storage_array_t _v = { nullptr };
|
||||
return _v;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct get_storage<::rocprofsys::rocprofiler_sdk::counter_data_tracker>
|
||||
{
|
||||
using type = ::rocprofsys::rocprofiler_sdk::counter_data_tracker;
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(get_storage)
|
||||
|
||||
auto operator()(const type&) const
|
||||
{
|
||||
return operation::set_storage<type>::get().at(0);
|
||||
}
|
||||
|
||||
auto operator()() const
|
||||
{
|
||||
type _obj{};
|
||||
return (*this)(_obj);
|
||||
}
|
||||
|
||||
auto operator()(size_t _idx) const
|
||||
{
|
||||
return operation::set_storage<type>::get().at(_idx);
|
||||
}
|
||||
|
||||
auto operator()(type&, size_t _idx) const { return (*this)(_idx); }
|
||||
};
|
||||
} // namespace operation
|
||||
} // namespace tim
|
||||
@@ -0,0 +1,270 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/rocprofiler-sdk/fwd.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/state.hpp"
|
||||
|
||||
#include <timemory/utility/join.hpp>
|
||||
|
||||
#include <exception>
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using tool_agent_vec_t = std::vector<tool_agent>;
|
||||
|
||||
// Callback for rocprofiler_iterate_counter_dimensions: appends every reported
// dimension record to the vector passed through user_data.
// @param dim_info Array of num_dims dimension records
// @param num_dims Number of entries in dim_info
// @param user_data Pointer to a std::vector<rocprofiler_record_dimension_info_t>
// @return Always ROCPROFILER_STATUS_SUCCESS
rocprofiler_status_t
dimensions_info_callback(rocprofiler_counter_id_t /*id*/,
                         const rocprofiler_record_dimension_info_t* dim_info,
                         long unsigned int num_dims, void* user_data)
{
    using dimension_vec_t = std::vector<rocprofiler_record_dimension_info_t>;

    auto& _dims = *static_cast<dimension_vec_t*>(user_data);
    _dims.reserve(num_dims);

    for(const auto* _itr = dim_info; _itr != dim_info + num_dims; ++_itr)
        _dims.emplace_back(*_itr);

    return ROCPROFILER_STATUS_SUCCESS;
}
|
||||
|
||||
// Callback for rocprofiler_iterate_agent_supported_counters: collects counter
// info and dimension info for every non-constant counter of the agent.
// @param agent_id Agent whose counters are being reported
// @param counters Array of num_counters counter ids
// @param num_counters Number of entries in counters
// @param user_data Pointer to the agent_counter_info_map_t being filled
// @return Always ROCPROFILER_STATUS_SUCCESS (failed queries go through ROCPROFILER_CALL)
rocprofiler_status_t
counters_supported_callback(rocprofiler_agent_id_t    agent_id,
                            rocprofiler_counter_id_t* counters, size_t num_counters,
                            void* user_data)
{
    using counter_vec_t = typename agent_counter_info_map_t::mapped_type;

    auto& _agent_map = *static_cast<agent_counter_info_map_t*>(user_data);

    // Ensure the agent has an entry even if it ends up with no counters
    _agent_map.emplace(agent_id, counter_vec_t{});

    for(size_t i = 0; i < num_counters; ++i)
    {
        const auto& _counter = counters[i];
        auto        _info    = rocprofiler_counter_info_v0_t{};
        auto        _dims    = std::vector<rocprofiler_record_dimension_info_t>{};

        ROCPROFILER_CALL(rocprofiler_query_counter_info(
            _counter, ROCPROFILER_COUNTER_INFO_VERSION_0, &_info));

        // Fill the local dimension vector through the callback
        ROCPROFILER_CALL(rocprofiler_iterate_counter_dimensions(
            _counter, dimensions_info_callback, &_dims));

        // Constant counters are not recorded
        if(_info.is_constant) continue;

        _agent_map.at(agent_id).emplace_back(agent_id, _info, std::move(_dims));
    }

    return ROCPROFILER_STATUS_SUCCESS;
}
|
||||
|
||||
// Query the supported counters of every agent in _agents.
// Each agent's counters are sorted by counter id handle, and each counter's
// dimension info is sorted by dimension id.
// @param _agents Agents to query
// @return Map from agent id to that agent's sorted counter info
agent_counter_info_map_t
get_agent_counter_info(const tool_agent_vec_t& _agents)
{
    auto _by_handle = [](const auto& lhs, const auto& rhs) {
        return (lhs.id.handle < rhs.id.handle);
    };
    auto _by_id = [](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); };

    auto _data = agent_counter_info_map_t{};

    for(const auto& itr : _agents)
    {
        const auto _agent_id = itr.agent->id;

        ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
            _agent_id, counters_supported_callback, &_data));

        // at() throws if the callback never created an entry for this agent
        auto& _counters = _data.at(_agent_id);
        std::sort(_counters.begin(), _counters.end(), _by_handle);

        for(auto& citr : _counters)
            std::sort(citr.dimension_info.begin(), citr.dimension_info.end(), _by_id);
    }

    return _data;
}
|
||||
} // namespace
|
||||
|
||||
rocprofiler_tool_counter_info_t::rocprofiler_tool_counter_info_t(
|
||||
rocprofiler_agent_id_t _agent_id, parent_type _info, dimension_info_vec_t&& _dim_info)
|
||||
: parent_type{ _info }
|
||||
, agent_id{ _agent_id }
|
||||
, dimension_info{ std::move(_dim_info) }
|
||||
{}
|
||||
|
||||
void
|
||||
client_data::initialize()
|
||||
{
|
||||
buffered_tracing_info = rocprofiler::sdk::get_buffer_tracing_names();
|
||||
callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();
|
||||
|
||||
static constexpr auto supported_agent_info_version = ROCPROFILER_AGENT_INFO_VERSION_0;
|
||||
|
||||
rocprofiler_query_available_agents_cb_t iterate_cb =
|
||||
[](rocprofiler_agent_version_t version, const void** agents_arr,
|
||||
size_t num_agents, void* user_data) {
|
||||
ROCPROFSYS_CONDITIONAL_ABORT(version != supported_agent_info_version,
|
||||
"rocprofiler agent info version != expected "
|
||||
"agent info version (=%i). value: %i\n",
|
||||
supported_agent_info_version, version);
|
||||
|
||||
auto _agents_v = std::vector<rocprofiler_agent_v0_t>{};
|
||||
for(size_t i = 0; i < num_agents; ++i)
|
||||
{
|
||||
const auto* _agent =
|
||||
static_cast<const rocprofiler_agent_v0_t*>(agents_arr[i]);
|
||||
_agents_v.emplace_back(*_agent);
|
||||
}
|
||||
|
||||
auto* tool_data_v = as_client_data(user_data);
|
||||
tool_data_v->set_agents(std::move(_agents_v));
|
||||
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_query_available_agents(
|
||||
supported_agent_info_version, iterate_cb, sizeof(rocprofiler_agent_t), this));
|
||||
}
|
||||
|
||||
void
|
||||
client_data::initialize_event_info()
|
||||
{
|
||||
if(agents.empty()) initialize();
|
||||
|
||||
if(agent_counter_info.size() != gpu_agents.size())
|
||||
agent_counter_info = get_agent_counter_info(gpu_agents);
|
||||
|
||||
try
|
||||
{
|
||||
using qualifier_t = tim::hardware_counters::qualifier;
|
||||
using qualifier_vec_t = std::vector<qualifier_t>;
|
||||
|
||||
for(const auto& aitr : gpu_agents)
|
||||
{
|
||||
auto _dev_index = aitr.device_id;
|
||||
auto _device_qualifier_sym = JOIN("", ":device=", _dev_index);
|
||||
auto _device_qualifier =
|
||||
tim::hardware_counters::qualifier{ true, static_cast<int>(_dev_index),
|
||||
_device_qualifier_sym,
|
||||
JOIN(" ", "Device", _dev_index) };
|
||||
|
||||
auto _counter_info = agent_counter_info.at(aitr.agent->id);
|
||||
std::sort(_counter_info.begin(), _counter_info.end(),
|
||||
[](const rocprofiler_tool_counter_info_t& lhs,
|
||||
const rocprofiler_tool_counter_info_t& rhs) {
|
||||
if(lhs.is_constant && rhs.is_constant)
|
||||
return lhs.id < rhs.id;
|
||||
else if(lhs.is_constant)
|
||||
return true;
|
||||
else if(rhs.is_constant)
|
||||
return false;
|
||||
|
||||
if(!lhs.is_derived && !rhs.is_derived)
|
||||
return lhs.id < rhs.id;
|
||||
else if(!lhs.is_derived)
|
||||
return true;
|
||||
else if(!rhs.is_derived)
|
||||
return false;
|
||||
|
||||
return lhs.id < rhs.id;
|
||||
});
|
||||
|
||||
for(const auto& ditr : _counter_info)
|
||||
{
|
||||
auto _long_desc = std::string{ ditr.description };
|
||||
auto _units = std::string{};
|
||||
auto _pysym = std::string{};
|
||||
if(ditr.is_constant)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
else if(ditr.is_derived)
|
||||
{
|
||||
auto _sym = JOIN("", ditr.name, _device_qualifier_sym);
|
||||
auto _short_desc = JOIN("", "Derived counter: ", ditr.expression);
|
||||
events_info.emplace_back(hardware_counter_info(
|
||||
true, tim::hardware_counters::api::rocm, events_info.size(), 0,
|
||||
_sym, _pysym, _short_desc, _long_desc, _units,
|
||||
qualifier_vec_t{ _device_qualifier }));
|
||||
}
|
||||
else
|
||||
{
|
||||
auto _dim_info = std::vector<std::string>{};
|
||||
|
||||
for(const auto& itr : ditr.dimension_info)
|
||||
{
|
||||
auto _info = (itr.instance_size > 1)
|
||||
? JOIN("", itr.name, "[", 0, ":",
|
||||
itr.instance_size - 1, "]")
|
||||
: std::string{};
|
||||
if(!_info.empty()) _dim_info.emplace_back(_info);
|
||||
}
|
||||
|
||||
auto _sym = JOIN("", ditr.name, _device_qualifier_sym);
|
||||
auto _short_desc = JOIN("", ditr.name, " on device ", _dev_index);
|
||||
if(!_dim_info.empty())
|
||||
{
|
||||
namespace join = ::timemory::join;
|
||||
_short_desc += JOIN(
|
||||
"", ". ",
|
||||
join::join(join::array_config{ ", ", "", "" }, _dim_info));
|
||||
}
|
||||
events_info.emplace_back(hardware_counter_info(
|
||||
true, tim::hardware_counters::api::rocm, events_info.size(), 0,
|
||||
_sym, _pysym, _short_desc, _long_desc, _units,
|
||||
qualifier_vec_t{ _device_qualifier }));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch(std::exception& _e)
|
||||
{
|
||||
ROCPROFSYS_WARNING_F(1, "Constructing ROCm event info failed: %s\n", _e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
client_data::set_agents(agent_vec_t&& _agents_v)
|
||||
{
|
||||
agents = std::move(_agents_v);
|
||||
|
||||
std::sort(agents.begin(), agents.end(),
|
||||
[](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; });
|
||||
|
||||
cpu_agents.clear();
|
||||
gpu_agents.clear();
|
||||
|
||||
for(const auto& itr : agents)
|
||||
{
|
||||
if(itr.type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
cpu_agents.emplace_back(tool_agent{ cpu_agents.size(), &itr });
|
||||
else if(itr.type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
gpu_agents.emplace_back(tool_agent{ gpu_agents.size(), &itr });
|
||||
}
|
||||
}
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
@@ -0,0 +1,252 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/synchronized.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/buffer_tracing.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
#include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
#include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
using hardware_counter_info = ::tim::hardware_counters::info;
|
||||
|
||||
using kernel_symbol_data_t =
|
||||
rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
using kernel_symbol_map_t =
|
||||
std::unordered_map<rocprofiler_kernel_id_t, kernel_symbol_data_t>;
|
||||
using callback_arg_array_t = std::vector<std::pair<std::string, std::string>>;
|
||||
|
||||
struct code_object_callback_record_t
|
||||
{
|
||||
uint64_t timestamp = 0;
|
||||
rocprofiler_callback_tracing_record_t record = {};
|
||||
rocprofiler_callback_tracing_code_object_load_data_t payload = {};
|
||||
};
|
||||
|
||||
struct kernel_symbol_callback_record_t
|
||||
{
|
||||
uint64_t timestamp = 0;
|
||||
rocprofiler_callback_tracing_record_t record = {};
|
||||
kernel_symbol_data_t payload = {};
|
||||
};
|
||||
|
||||
struct rocprofiler_tool_counter_info_t : rocprofiler_counter_info_v0_t
|
||||
{
|
||||
using this_type = rocprofiler_tool_counter_info_t;
|
||||
using parent_type = rocprofiler_counter_info_v0_t;
|
||||
using dimension_info_vec_t = std::vector<rocprofiler_record_dimension_info_t>;
|
||||
|
||||
rocprofiler_tool_counter_info_t(rocprofiler_agent_id_t _agent_id, parent_type _info,
|
||||
dimension_info_vec_t&& _dim_info);
|
||||
|
||||
rocprofiler_tool_counter_info_t() = default;
|
||||
~rocprofiler_tool_counter_info_t() = default;
|
||||
rocprofiler_tool_counter_info_t(const rocprofiler_tool_counter_info_t&) = default;
|
||||
rocprofiler_tool_counter_info_t(rocprofiler_tool_counter_info_t&&) noexcept = default;
|
||||
rocprofiler_tool_counter_info_t& operator=(const rocprofiler_tool_counter_info_t&) =
|
||||
default;
|
||||
rocprofiler_tool_counter_info_t& operator =(
|
||||
rocprofiler_tool_counter_info_t&&) noexcept = default;
|
||||
|
||||
rocprofiler_agent_id_t agent_id = {};
|
||||
std::vector<rocprofiler_record_dimension_info_t> dimension_info = {};
|
||||
};
|
||||
|
||||
struct tool_agent
|
||||
{
|
||||
uint64_t device_id = 0;
|
||||
const rocprofiler_agent_v0_t* agent = nullptr;
|
||||
};
|
||||
|
||||
struct timing_interval
|
||||
{
|
||||
rocprofiler_timestamp_t start = 0;
|
||||
rocprofiler_timestamp_t end = 0;
|
||||
};
|
||||
|
||||
using agent_counter_info_map_t =
|
||||
std::unordered_map<rocprofiler_agent_id_t,
|
||||
std::vector<rocprofiler_tool_counter_info_t>>;
|
||||
|
||||
using agent_counter_profile_map_t =
|
||||
std::unordered_map<rocprofiler_agent_id_t,
|
||||
std::optional<rocprofiler_profile_config_id_t>>;
|
||||
|
||||
using counter_id_vec_t = std::vector<rocprofiler_counter_id_t>;
|
||||
|
||||
using agent_counter_id_map_t =
|
||||
std::unordered_map<rocprofiler_agent_id_t, counter_id_vec_t>;
|
||||
|
||||
using backtrace_operation_map_t =
|
||||
std::unordered_map<rocprofiler_callback_tracing_kind_t,
|
||||
std::unordered_set<rocprofiler_tracing_operation_t>>;
|
||||
|
||||
struct client_data
|
||||
{
|
||||
static constexpr size_t num_buffers = 3;
|
||||
static constexpr size_t num_contexts = 2;
|
||||
|
||||
using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t<std::string_view>;
|
||||
using callback_name_info_t = rocprofiler::sdk::callback_name_info_t<std::string_view>;
|
||||
using kernel_symbol_vec_t = std::vector<kernel_symbol_callback_record_t*>;
|
||||
using code_object_vec_t = std::vector<code_object_callback_record_t>;
|
||||
using buffer_id_vec_t = std::array<rocprofiler_buffer_id_t, num_buffers>;
|
||||
using context_id_vec_t = std::array<rocprofiler_context_id_t, num_contexts>;
|
||||
using agent_vec_t = std::vector<rocprofiler_agent_v0_t>;
|
||||
|
||||
rocprofiler_client_id_t* client_id = nullptr;
|
||||
rocprofiler_client_finalize_t client_fini = nullptr;
|
||||
rocprofiler_context_id_t primary_ctx = { 0 };
|
||||
rocprofiler_context_id_t counter_ctx = { 0 };
|
||||
rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t memory_copy_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t counter_collection_buffer = { 0 };
|
||||
std::vector<rocprofiler_agent_v0_t> agents = {};
|
||||
std::vector<tool_agent> cpu_agents = {};
|
||||
std::vector<tool_agent> gpu_agents = {};
|
||||
std::vector<hardware_counter_info> events_info = {};
|
||||
agent_counter_id_map_t agent_events = {};
|
||||
agent_counter_info_map_t agent_counter_info = {};
|
||||
agent_counter_profile_map_t agent_counter_profiles = {};
|
||||
common::synchronized<code_object_vec_t> code_object_records = {};
|
||||
common::synchronized<kernel_symbol_vec_t> kernel_symbol_records = {};
|
||||
buffer_name_info_t buffered_tracing_info = {};
|
||||
callback_name_info_t callback_tracing_info = {};
|
||||
backtrace_operation_map_t backtrace_operations = {};
|
||||
|
||||
void initialize();
|
||||
void initialize_event_info();
|
||||
void set_agents(agent_vec_t&& agents);
|
||||
context_id_vec_t get_contexts() const;
|
||||
buffer_id_vec_t get_buffers() const;
|
||||
const rocprofiler_agent_t* get_agent(rocprofiler_agent_id_t _id) const;
|
||||
const tool_agent* get_gpu_tool_agent(rocprofiler_agent_id_t id) const;
|
||||
const kernel_symbol_data_t* get_kernel_symbol_info(uint64_t _kernel_id) const;
|
||||
const rocprofiler_tool_counter_info_t* get_tool_counter_info(
|
||||
rocprofiler_agent_id_t _agent_id, rocprofiler_counter_id_t _counter_id) const;
|
||||
};
|
||||
|
||||
// Returns the primary and counter contexts, in that order.
inline client_data::context_id_vec_t
client_data::get_contexts() const
{
    context_id_vec_t _ids = { primary_ctx, counter_ctx };
    return _ids;
}
|
||||
|
||||
// Returns the kernel-dispatch, memory-copy and counter-collection buffers, in
// that order.
inline client_data::buffer_id_vec_t
client_data::get_buffers() const
{
    buffer_id_vec_t _ids = { kernel_dispatch_buffer, memory_copy_buffer,
                             counter_collection_buffer };
    return _ids;
}
|
||||
|
||||
// Linear search of `agents` for the given id; nullptr when no agent matches.
// The returned pointer refers to an element of `agents`.
inline const rocprofiler_agent_t*
client_data::get_agent(rocprofiler_agent_id_t _id) const
{
    for(auto itr = agents.begin(); itr != agents.end(); ++itr)
    {
        if(itr->id == _id) return &(*itr);
    }
    return nullptr;
}
|
||||
|
||||
// Linear search of `gpu_agents` for the entry whose agent has the given id;
// nullptr when there is none.
inline const tool_agent*
client_data::get_gpu_tool_agent(rocprofiler_agent_id_t id) const
{
    for(auto itr = gpu_agents.begin(); itr != gpu_agents.end(); ++itr)
    {
        if(id == itr->agent->id) return &(*itr);
    }
    return nullptr;
}
|
||||
|
||||
// Looks up the kernel symbol payload registered for `_kernel_id`.
//
// The search runs inside kernel_symbol_records.rlock(). Returns a pointer to
// the payload of the first matching record, or nullptr when no record matches.
// Null record pointers are skipped instead of being dereferenced.
inline const kernel_symbol_data_t*
client_data::get_kernel_symbol_info(uint64_t _kernel_id) const
{
    return kernel_symbol_records.rlock(
        [_kernel_id](const auto& _data) -> const kernel_symbol_data_t* {
            for(const auto& itr : _data)
            {
                if(itr && _kernel_id == itr->payload.kernel_id) return &itr->payload;
            }
            return nullptr;
        });
}
|
||||
|
||||
// Finds the counter description for (`_agent_id`, `_counter_id`).
//
// Returns nullptr when the agent has no counter table or when the table holds
// no counter with the requested id. An unknown agent is reported as "not
// found" rather than by throwing std::out_of_range, so callers only have one
// failure mode to check.
inline const rocprofiler_tool_counter_info_t*
client_data::get_tool_counter_info(rocprofiler_agent_id_t   _agent_id,
                                   rocprofiler_counter_id_t _counter_id) const
{
    auto _agent_itr = agent_counter_info.find(_agent_id);
    if(_agent_itr == agent_counter_info.end()) return nullptr;

    for(const auto& itr : _agent_itr->second)
    {
        if(itr.id == _counter_id) return &itr;
    }
    return nullptr;
}
|
||||
|
||||
// Converts an opaque pointer back to client_data*. No checking is performed;
// the caller must pass a pointer that really refers to a client_data.
inline constexpr client_data*
as_client_data(void* _ptr)
{
    auto* _data = static_cast<client_data*>(_ptr);
    return _data;
}
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
|
||||
// ROCPROFILER_CALL(expr): evaluates `expr` once as a rocprofiler_status_t and,
// when it is not ROCPROFILER_STATUS_SUCCESS, emits a warning containing the
// expression text, source location, status code and the status string.
// Execution continues after the warning. The macro expands to a braced block.
#ifndef ROCPROFILER_CALL
#    define ROCPROFILER_CALL(result)                                                     \
        {                                                                                \
            rocprofiler_status_t ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) =          \
                (result);                                                                \
            if(ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) !=                           \
               ROCPROFILER_STATUS_SUCCESS)                                               \
            {                                                                            \
                std::string _rocp_status_msg = rocprofiler_get_status_string(            \
                    ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__));                       \
                auto _rocp_msg = std::stringstream{};                                    \
                _rocp_msg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] "     \
                          << "rocprofiler-sdk call [" << #result                         \
                          << "] failed with error code "                                 \
                          << ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__)                \
                          << " :: " << _rocp_status_msg;                                 \
                ROCPROFSYS_WARNING(0, "%s\n", _rocp_msg.str().c_str());                  \
            }                                                                            \
        }
#endif
|
||||
@@ -1,834 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/rocprofiler.hpp"
|
||||
#include "core/common.hpp"
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/gpu.hpp"
|
||||
#include "core/perfetto.hpp"
|
||||
#include "library/rocm.hpp"
|
||||
#include "library/rocm/hsa_rsrc_factory.hpp"
|
||||
|
||||
#include <timemory/backends/hardware_counters.hpp>
|
||||
#include <timemory/manager.hpp>
|
||||
#include <timemory/mpl/concepts.hpp>
|
||||
#include <timemory/storage/types.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <rocprofiler.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdlib>
|
||||
#include <dlfcn.h>
|
||||
#include <hsa.h>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <string.h>
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using ::rocprofiler::util::AgentInfo;
|
||||
using ::rocprofiler::util::HsaRsrcFactory;
|
||||
|
||||
auto&
|
||||
get_event_names()
|
||||
{
|
||||
static auto _v = std::map<uint32_t, std::vector<rocprofiler_feature_t>>{};
|
||||
return _v;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Error handler
|
||||
void
|
||||
fatal(const std::string& msg)
|
||||
{
|
||||
ROCPROFSYS_PRINT_F("\n");
|
||||
ROCPROFSYS_PRINT_F("%s\n", msg.c_str());
|
||||
abort();
|
||||
}
|
||||
|
||||
// Check returned HSA API status
|
||||
const char*
|
||||
rocm_error_string(hsa_status_t _status)
|
||||
{
|
||||
const char* _err_string = nullptr;
|
||||
if(_status != HSA_STATUS_SUCCESS) rocprofiler_error_string(&_err_string);
|
||||
return _err_string;
|
||||
}
|
||||
|
||||
// Check returned HSA API status
|
||||
bool
|
||||
rocm_check_status(hsa_status_t _status, const std::set<hsa_status_t>& _nonfatal = {})
|
||||
{
|
||||
if(_status != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
if(_nonfatal.count(_status) == 0)
|
||||
fatal(JOIN(" :: ", "ERROR", rocm_error_string(_status)));
|
||||
|
||||
ROCPROFSYS_PRINT_F("Warning! %s\n", rocm_error_string(_status));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Context stored entry type
|
||||
struct context_entry_t
|
||||
{
|
||||
bool valid;
|
||||
hsa_agent_t agent;
|
||||
rocprofiler_group_t group;
|
||||
rocprofiler_callback_data_t data;
|
||||
};
|
||||
|
||||
// Context callback arg
|
||||
struct callbacks_arg_t
|
||||
{
|
||||
rocprofiler_pool_t** pools;
|
||||
};
|
||||
|
||||
// Handler callback arg
|
||||
struct handler_arg_t
|
||||
{
|
||||
rocprofiler_feature_t* features;
|
||||
unsigned feature_count;
|
||||
};
|
||||
|
||||
// Mutable flag recording whether initialization has completed. Starts out
// false; the same object is returned by every call.
bool&
is_setup()
{
    static auto _v = false;
    return _v;
}
|
||||
|
||||
std::map<uint32_t, std::vector<std::string_view>>
|
||||
get_data_labels()
|
||||
{
|
||||
auto _v = std::map<uint32_t, std::vector<std::string_view>>{};
|
||||
for(const auto& itr : get_event_names())
|
||||
{
|
||||
_v[itr.first] = {};
|
||||
for(auto vitr : itr.second)
|
||||
_v[itr.first].emplace_back(std::string_view{ vitr.name });
|
||||
}
|
||||
return _v;
|
||||
}
|
||||
|
||||
// Dump stored context entry
|
||||
void
|
||||
rocm_dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features,
|
||||
unsigned feature_count)
|
||||
{
|
||||
volatile std::atomic<bool>* valid =
|
||||
reinterpret_cast<std::atomic<bool>*>(&entry->valid);
|
||||
while(valid->load() == false)
|
||||
sched_yield();
|
||||
|
||||
const rocprofiler_dispatch_record_t* record = entry->data.record;
|
||||
|
||||
if(!record) return; // there is nothing to do here.
|
||||
|
||||
auto _queue_id = entry->data.queue_id;
|
||||
auto _thread_id = entry->data.thread_id;
|
||||
auto _dev_id = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index;
|
||||
auto _kernel_name = std::string{ entry->data.kernel_name };
|
||||
auto _pos = _kernel_name.find_last_of(')');
|
||||
if(_pos != std::string::npos) _kernel_name = _kernel_name.substr(0, _pos + 1);
|
||||
|
||||
rocprofiler_group_t& group = entry->group;
|
||||
if(group.context == nullptr)
|
||||
{
|
||||
fatal("context is nullptr\n");
|
||||
}
|
||||
|
||||
if(feature_count > 0)
|
||||
{
|
||||
rocm_check_status(rocprofiler_group_get_data(&group));
|
||||
rocm_check_status(rocprofiler_get_metrics(group.context));
|
||||
}
|
||||
|
||||
auto _evt =
|
||||
component::rocm_event{ _dev_id, _thread_id, _queue_id, _kernel_name,
|
||||
record->begin, record->end, feature_count, features };
|
||||
|
||||
component::rocm_data()->emplace_back(_evt);
|
||||
}
|
||||
|
||||
// Profiling completion handler
|
||||
// Dump and delete the context entry
|
||||
// Return true if the context was dumped successfully
|
||||
bool
|
||||
rocm_context_handler(const rocprofiler_pool_entry_t* entry, void* arg)
|
||||
{
|
||||
// Context entry
|
||||
context_entry_t* ctx_entry = reinterpret_cast<context_entry_t*>(entry->payload);
|
||||
handler_arg_t* handler_arg = reinterpret_cast<handler_arg_t*>(arg);
|
||||
|
||||
// rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock };
|
||||
// if(!_lk.owns_lock()) _lk.lock();
|
||||
|
||||
rocm_dump_context_entry(ctx_entry, handler_arg->features, handler_arg->feature_count);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Kernel disoatch callback
|
||||
hsa_status_t
|
||||
rocm_dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg,
|
||||
rocprofiler_group_t* group)
|
||||
{
|
||||
// Passed tool data
|
||||
hsa_agent_t agent = callback_data->agent;
|
||||
|
||||
// Open profiling context
|
||||
const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index;
|
||||
callbacks_arg_t* callbacks_arg = reinterpret_cast<callbacks_arg_t*>(arg);
|
||||
rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id];
|
||||
rocprofiler_pool_entry_t pool_entry{};
|
||||
rocm_check_status(rocprofiler_pool_fetch(pool, &pool_entry));
|
||||
// Profiling context entry
|
||||
rocprofiler_t* context = pool_entry.context;
|
||||
context_entry_t* entry = reinterpret_cast<context_entry_t*>(pool_entry.payload);
|
||||
|
||||
// Get group[0]
|
||||
rocm_check_status(rocprofiler_get_group(context, 0, group));
|
||||
|
||||
// Fill profiling context entry
|
||||
entry->agent = agent;
|
||||
entry->group = *group;
|
||||
entry->data = *callback_data;
|
||||
entry->data.kernel_name = strdup(callback_data->kernel_name);
|
||||
reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
unsigned
|
||||
metrics_input(unsigned _device, rocprofiler_feature_t** ret)
|
||||
{
|
||||
// Profiling feature objects
|
||||
auto _events = tim::delimit(config::get_rocm_events(), ", ;\t\n");
|
||||
std::vector<std::string> _features = {};
|
||||
auto _this_device = JOIN("", ":device=", _device);
|
||||
for(auto itr : _events)
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(3, "Processing feature '%s' for device %u...\n", itr.c_str(),
|
||||
_device);
|
||||
auto _pos = itr.find(":device=");
|
||||
if(_pos != std::string::npos)
|
||||
{
|
||||
if(itr.find(_this_device) != std::string::npos)
|
||||
{
|
||||
_features.emplace_back(itr.substr(0, _pos));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
_features.emplace_back(itr);
|
||||
}
|
||||
}
|
||||
const unsigned feature_count = _features.size();
|
||||
rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count];
|
||||
memset(features, 0, feature_count * sizeof(rocprofiler_feature_t));
|
||||
|
||||
// PMC events
|
||||
for(unsigned i = 0; i < feature_count; ++i)
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(3, "Adding feature '%s' for device %u...\n",
|
||||
_features.at(i).c_str(), _device);
|
||||
features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC;
|
||||
features[i].name = strdup(_features.at(i).c_str());
|
||||
features[i].parameters = nullptr;
|
||||
features[i].parameter_count = 0;
|
||||
}
|
||||
|
||||
*ret = features;
|
||||
return feature_count;
|
||||
}
|
||||
|
||||
using info_data = std::vector<component::rocm_info_entry>;
|
||||
|
||||
hsa_status_t
|
||||
info_data_callback(const rocprofiler_info_data_t info, void* arg)
|
||||
{
|
||||
using qualifier_t = tim::hardware_counters::qualifier;
|
||||
using qualifier_vec_t = std::vector<qualifier_t>;
|
||||
auto* _data = static_cast<info_data*>(arg);
|
||||
auto _dev_index = info.agent_index;
|
||||
|
||||
switch(info.kind)
|
||||
{
|
||||
case ROCPROFILER_INFO_KIND_METRIC:
|
||||
{
|
||||
auto _device_qualifier_sym = JOIN("", ":device=", _dev_index);
|
||||
auto _device_qualifier =
|
||||
tim::hardware_counters::qualifier{ true, static_cast<int>(_dev_index),
|
||||
_device_qualifier_sym,
|
||||
JOIN(" ", "Device", _dev_index) };
|
||||
auto _long_desc = std::string{ info.metric.description };
|
||||
auto _units = std::string{};
|
||||
auto _pysym = std::string{};
|
||||
if(info.metric.expr != nullptr)
|
||||
{
|
||||
auto _sym = JOIN("", info.metric.name, _device_qualifier_sym);
|
||||
auto _short_desc = JOIN("", "Derived counter: ", info.metric.expr);
|
||||
_data->emplace_back(component::rocm_info_entry(
|
||||
true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym,
|
||||
_pysym, _short_desc, _long_desc, _units,
|
||||
qualifier_vec_t{ _device_qualifier }));
|
||||
}
|
||||
else
|
||||
{
|
||||
if(info.metric.instances == 1)
|
||||
{
|
||||
auto _sym = JOIN("", info.metric.name, _device_qualifier_sym);
|
||||
auto _short_desc =
|
||||
JOIN("", info.metric.name, " on device ", _dev_index);
|
||||
_data->emplace_back(component::rocm_info_entry(
|
||||
true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym,
|
||||
_pysym, _short_desc, _long_desc, _units,
|
||||
qualifier_vec_t{ _device_qualifier }));
|
||||
}
|
||||
else
|
||||
{
|
||||
for(uint32_t i = 0; i < info.metric.instances; ++i)
|
||||
{
|
||||
auto _instance_qualifier_sym = JOIN("", '[', i, ']');
|
||||
auto _instance_qualifier =
|
||||
tim::hardware_counters::qualifier{ true, static_cast<int>(i),
|
||||
_instance_qualifier_sym,
|
||||
JOIN(" ", "Instance", i) };
|
||||
auto _sym = JOIN("", info.metric.name, _instance_qualifier_sym,
|
||||
_device_qualifier_sym);
|
||||
auto _short_desc = JOIN("", info.metric.name, " instance ", i,
|
||||
" on device ", _dev_index);
|
||||
_data->emplace_back(component::rocm_info_entry(
|
||||
true, tim::hardware_counters::api::rocm, _data->size(), 0,
|
||||
_sym, _pysym, _short_desc, _long_desc, _units,
|
||||
qualifier_vec_t{ _device_qualifier, _instance_qualifier }));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: printf("wrong info kind %u\n", info.kind); return HSA_STATUS_ERROR;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
std::vector<component::rocm_info_entry>
|
||||
rocm_metrics()
|
||||
{
|
||||
std::vector<component::rocm_info_entry> _data = {};
|
||||
try
|
||||
{
|
||||
(void) HsaRsrcFactory::Instance();
|
||||
} catch(std::runtime_error& _e)
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(0, "%s\n", _e.what());
|
||||
return _data;
|
||||
}
|
||||
|
||||
// Available GPU agents
|
||||
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
|
||||
|
||||
std::vector<AgentInfo*> _gpu_agents(gpu_count, nullptr);
|
||||
for(unsigned i = 0; i < gpu_count; ++i)
|
||||
{
|
||||
const AgentInfo* _agent = _gpu_agents[i];
|
||||
const AgentInfo** _agent_p = &_agent;
|
||||
HsaRsrcFactory::Instance().GetGpuAgentInfo(i, _agent_p);
|
||||
|
||||
if(!rocm_check_status(rocprofiler_iterate_info(
|
||||
&_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC,
|
||||
info_data_callback, reinterpret_cast<void*>(&_data)),
|
||||
{ HSA_STATUS_ERROR_NOT_INITIALIZED }))
|
||||
{
|
||||
ROCPROFSYS_WARNING_F(-1, "rocprofiler_iterate_info failed for gpu agent %u\n",
|
||||
i);
|
||||
}
|
||||
}
|
||||
|
||||
if(gpu_count > 0 && _data.empty())
|
||||
{
|
||||
if(!rocm_check_status(rocprofiler_iterate_info(
|
||||
nullptr, ROCPROFILER_INFO_KIND_METRIC,
|
||||
info_data_callback, reinterpret_cast<void*>(&_data)),
|
||||
{ HSA_STATUS_ERROR_NOT_INITIALIZED }))
|
||||
{
|
||||
ROCPROFSYS_WARNING_F(
|
||||
-1, "rocprofiler_iterate_info failed for %i gpu agents\n", gpu_count);
|
||||
}
|
||||
}
|
||||
|
||||
auto _settings = tim::settings::shared_instance();
|
||||
if(_settings)
|
||||
{
|
||||
auto ritr = _settings->find("ROCPROFSYS_ROCM_EVENTS");
|
||||
if(ritr != _settings->end())
|
||||
{
|
||||
auto _rocm_events = ritr->second;
|
||||
if(_rocm_events->get_choices().empty())
|
||||
{
|
||||
std::vector<std::string> _choices = {};
|
||||
_choices.reserve(_data.size());
|
||||
for(auto itr : _data)
|
||||
{
|
||||
if(!itr.symbol().empty()) _choices.emplace_back(itr.symbol());
|
||||
}
|
||||
_rocm_events->set_choices(_choices);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return _data;
|
||||
}
|
||||
|
||||
void
|
||||
rocm_initialize()
|
||||
{
|
||||
// Available GPU agents
|
||||
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
|
||||
|
||||
(void) rocm_metrics();
|
||||
|
||||
// Adding dispatch observer
|
||||
callbacks_arg_t* callbacks_arg = new callbacks_arg_t{};
|
||||
callbacks_arg->pools = new rocprofiler_pool_t*[gpu_count];
|
||||
for(unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++)
|
||||
{
|
||||
// Getting profiling features
|
||||
rocprofiler_feature_t* features = nullptr;
|
||||
unsigned feature_count = metrics_input(gpu_id, &features);
|
||||
|
||||
if(features)
|
||||
{
|
||||
get_event_names()[gpu_id].clear();
|
||||
get_event_names()[gpu_id].reserve(feature_count);
|
||||
for(unsigned i = 0; i < feature_count; ++i)
|
||||
get_event_names().at(gpu_id).emplace_back(features[i]);
|
||||
}
|
||||
|
||||
// Handler arg
|
||||
handler_arg_t* handler_arg = new handler_arg_t{};
|
||||
handler_arg->features = features;
|
||||
handler_arg->feature_count = feature_count;
|
||||
|
||||
// Context properties
|
||||
rocprofiler_pool_properties_t properties{};
|
||||
properties.num_entries = 100;
|
||||
properties.payload_bytes = sizeof(context_entry_t);
|
||||
properties.handler = rocm_context_handler;
|
||||
properties.handler_arg = handler_arg;
|
||||
|
||||
// Getting GPU device info
|
||||
const AgentInfo* agent_info = nullptr;
|
||||
if(HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false)
|
||||
{
|
||||
fprintf(stderr, "GetGpuAgentInfo failed\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
// Open profiling pool
|
||||
rocprofiler_pool_t* pool = nullptr;
|
||||
uint32_t mode = 0; // ROCPROFILER_MODE_SINGLEGROUP
|
||||
rocm_check_status(rocprofiler_pool_open(agent_info->dev_id, features,
|
||||
feature_count, &pool, mode, &properties));
|
||||
callbacks_arg->pools[gpu_id] = pool;
|
||||
}
|
||||
|
||||
rocprofiler_queue_callbacks_t callbacks_ptrs{};
|
||||
callbacks_ptrs.dispatch = rocm_dispatch_callback;
|
||||
int err = rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg);
|
||||
ROCPROFSYS_VERBOSE_F(3, "err=%d, rocprofiler_set_queue_callbacks\n", err);
|
||||
|
||||
is_setup() = true;
|
||||
}
|
||||
|
||||
void
|
||||
rocm_cleanup()
|
||||
{
|
||||
// Unregister dispatch callback
|
||||
rocm_check_status(rocprofiler_remove_queue_callbacks());
|
||||
// close profiling pool
|
||||
// rocm_check_status(rocprofiler_pool_flush(pool));
|
||||
// rocm_check_status(rocprofiler_pool_close(pool));
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
// Short local names for the component types used by the post-processing
// routines in this namespace.
using rocm_event         = component::rocm_event;
using rocm_data_t        = component::rocm_data_t;
using rocm_metric_type   = component::rocm_metric_type;
using rocm_feature_value = component::rocm_feature_value;
using rocm_data_tracker  = component::rocm_data_tracker;
|
||||
|
||||
void
|
||||
post_process_perfetto()
|
||||
{
|
||||
using counter_track = perfetto_counter_track<rocm_event>;
|
||||
|
||||
static bool _once = false;
|
||||
if(_once) return;
|
||||
|
||||
auto _data = rocm_data_t{};
|
||||
auto _device_data = std::map<uint32_t, std::vector<rocm_event*>>{};
|
||||
auto _device_fields = std::map<uint32_t, std::vector<std::string_view>>{};
|
||||
auto _device_range = std::map<uint32_t, std::set<rocm_metric_type>>{};
|
||||
|
||||
for(size_t i = 0; i < ROCPROFSYS_MAX_THREADS; ++i)
|
||||
{
|
||||
auto& _v = component::rocm_data(i);
|
||||
if(_v)
|
||||
{
|
||||
_data.reserve(_data.size() + _v->size());
|
||||
for(auto& itr : *_v)
|
||||
_data.emplace_back(itr);
|
||||
}
|
||||
}
|
||||
|
||||
if(_data.empty()) return;
|
||||
_once = true;
|
||||
|
||||
std::sort(_data.begin(), _data.end());
|
||||
|
||||
auto _get_events = [](std::vector<rocm_event*>& _inp, rocm_metric_type _ts) {
|
||||
auto _v = std::vector<rocm_event*>{};
|
||||
for(const auto& itr : _inp)
|
||||
{
|
||||
if(_ts >= itr->entry && _ts <= itr->exit) _v.emplace_back(itr);
|
||||
if(_ts > itr->exit) break;
|
||||
}
|
||||
return _v;
|
||||
};
|
||||
|
||||
{
|
||||
auto _device_time = std::map<uint32_t, std::set<rocm_metric_type>>{};
|
||||
for(auto& itr : _data)
|
||||
{
|
||||
_device_data[itr.device_id].emplace_back(&itr);
|
||||
_device_time[itr.device_id].emplace(itr.entry);
|
||||
_device_time[itr.device_id].emplace(itr.exit);
|
||||
auto _dev_id = itr.device_id;
|
||||
if(get_use_perfetto() && !counter_track::exists(_dev_id))
|
||||
{
|
||||
auto addendum = [&](auto&& _v) {
|
||||
return JOIN(" ", "Device", _v, JOIN("", '[', _dev_id, ']'));
|
||||
};
|
||||
for(auto nitr : itr.feature_names)
|
||||
{
|
||||
auto _name = get_data_labels().at(itr.device_id).at(nitr);
|
||||
counter_track::emplace(_dev_id, addendum(_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(auto& ditr : _device_time)
|
||||
{
|
||||
for(auto itr = ditr.second.begin(); itr != ditr.second.end(); ++itr)
|
||||
{
|
||||
auto _next = std::next(itr);
|
||||
if(_next == ditr.second.end()) continue;
|
||||
_device_range[ditr.first].emplace(((*_next / 2) + (*itr / 2)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(auto& ditr : _device_range)
|
||||
{
|
||||
auto _dev_id = ditr.first;
|
||||
auto _values = std::vector<rocm_feature_value>{};
|
||||
auto _ts_sorted_data = _device_data[_dev_id];
|
||||
std::sort(_ts_sorted_data.begin(), _ts_sorted_data.end(),
|
||||
[](auto* _l, auto* _r) { return _l->exit < _r->exit; });
|
||||
for(const auto& itr : ditr.second)
|
||||
{
|
||||
auto _v = _get_events(_ts_sorted_data, itr);
|
||||
uint64_t _ts = itr;
|
||||
for(auto* vitr : _v)
|
||||
{
|
||||
size_t _n = vitr->feature_values.size();
|
||||
if(_values.empty())
|
||||
{
|
||||
_values.reserve(_n);
|
||||
for(size_t i = 0; i < _n; ++i)
|
||||
{
|
||||
_values.emplace_back(vitr->feature_values.at(i));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(size_t i = 0; i < _n; ++i)
|
||||
{
|
||||
#ifdef __GNUC__
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||
#endif
|
||||
auto _plus = [](auto& _lhs, auto&& _rhs) { _lhs += _rhs; };
|
||||
std::visit(_plus, _values.at(i), vitr->feature_values.at(i));
|
||||
#ifdef __GNUC__
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(size_t i = 0; i < _values.size(); ++i)
|
||||
{
|
||||
auto _trace_counter = [_dev_id, i, _ts](auto&& _val) {
|
||||
TRACE_COUNTER("kernel_hardware_counter",
|
||||
counter_track::at(_dev_id, i), _ts, _val);
|
||||
};
|
||||
std::visit(_trace_counter, _values.at(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
post_process_timemory()
|
||||
{
|
||||
static bool _once = false;
|
||||
if(_once) return;
|
||||
|
||||
auto _data = rocm_data_t{};
|
||||
auto _device_data = std::map<uint32_t, std::vector<rocm_event*>>{};
|
||||
auto _device_fields = std::map<uint32_t, std::vector<std::string_view>>{};
|
||||
auto _device_range = std::map<uint32_t, std::set<rocm_metric_type>>{};
|
||||
|
||||
for(size_t i = 0; i < ROCPROFSYS_MAX_THREADS; ++i)
|
||||
{
|
||||
auto& _v = component::rocm_data(i);
|
||||
if(_v)
|
||||
{
|
||||
_data.reserve(_data.size() + _v->size());
|
||||
for(auto& itr : *_v)
|
||||
_data.emplace_back(itr);
|
||||
}
|
||||
}
|
||||
|
||||
if(_data.empty()) return;
|
||||
_once = true;
|
||||
|
||||
std::sort(_data.begin(), _data.end());
|
||||
|
||||
for(auto& itr : _data)
|
||||
{
|
||||
_device_data[itr.device_id].emplace_back(&itr);
|
||||
}
|
||||
|
||||
for(auto& itr : _device_data)
|
||||
{
|
||||
// sort according to when it exited
|
||||
std::sort(itr.second.begin(), itr.second.end(),
|
||||
[](auto* _lhs, auto* _rhs) { return _lhs->exit < _rhs->exit; });
|
||||
}
|
||||
|
||||
using storage_type = typename rocm_data_tracker::storage_type;
|
||||
using bundle_type = tim::lightweight_tuple<rocm_data_tracker>;
|
||||
|
||||
auto _info = rocm_metrics();
|
||||
static auto _get_description = [&_info](std::string_view _v) {
|
||||
for(auto& itr : _info)
|
||||
{
|
||||
if(itr.symbol().find(_v) == 0 || itr.short_description().find(_v) == 0)
|
||||
{
|
||||
return itr.long_description();
|
||||
}
|
||||
}
|
||||
return std::string{};
|
||||
};
|
||||
|
||||
struct local_event
|
||||
{
|
||||
rocm_event* parent = nullptr;
|
||||
mutable std::vector<local_event> children = {};
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(local_event)
|
||||
|
||||
explicit local_event(rocm_event* _v)
|
||||
: parent{ _v }
|
||||
{}
|
||||
|
||||
bool operator()(rocm_event* _v)
|
||||
{
|
||||
if(!parent) return false;
|
||||
if(_v->device_id != parent->device_id) return false;
|
||||
if(_v->entry > parent->entry && _v->exit <= parent->exit)
|
||||
{
|
||||
children.emplace_back(_v);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool operator<(const local_event& _v) const
|
||||
{
|
||||
if(!parent && _v.parent) return true;
|
||||
if(parent && !_v.parent) return false;
|
||||
return *parent < *_v.parent;
|
||||
}
|
||||
|
||||
void operator()(int64_t _index, scope::config _scope) const
|
||||
{
|
||||
if(!parent) return;
|
||||
bundle_type _bundle{ parent->name, _scope };
|
||||
_bundle.push(parent->queue_id)
|
||||
.start()
|
||||
.store(parent->feature_values.at(_index));
|
||||
|
||||
std::sort(children.begin(), children.end());
|
||||
for(const auto& itr : children)
|
||||
itr(_index, _scope);
|
||||
|
||||
_bundle.stop().pop(parent->queue_id);
|
||||
}
|
||||
};
|
||||
|
||||
struct local_storage
|
||||
{
|
||||
int64_t index = 0;
|
||||
std::string metric_name = {};
|
||||
std::string metric_description = {};
|
||||
std::unique_ptr<storage_type> storage = {};
|
||||
|
||||
local_storage(uint32_t _devid, size_t _idx, std::string_view _name)
|
||||
: index{ static_cast<int64_t>(_idx) }
|
||||
, metric_name{ _name }
|
||||
, metric_description{ _get_description(metric_name) }
|
||||
{
|
||||
auto _metric_name = std::string{ _name };
|
||||
_metric_name = std::regex_replace(
|
||||
_metric_name, std::regex{ "(.*)\\[([0-9]+)\\]" }, "$1_$2");
|
||||
storage = std::make_unique<storage_type>(
|
||||
tim::standalone_storage{}, index,
|
||||
JOIN('-', "rocprof", "device", _devid, _metric_name));
|
||||
}
|
||||
|
||||
void operator()(const local_event& _event, scope::config _scope) const
|
||||
{
|
||||
operation::set_storage<rocm_data_tracker>{}(storage.get());
|
||||
_event(index, _scope);
|
||||
}
|
||||
|
||||
void write() const
|
||||
{
|
||||
rocm_data_tracker::label() = metric_name;
|
||||
rocm_data_tracker::description() = metric_description;
|
||||
storage->write();
|
||||
}
|
||||
};
|
||||
|
||||
auto _local_data = std::map<uint32_t, std::vector<local_event>>{};
|
||||
auto _scope = scope::get_default();
|
||||
|
||||
for(auto& ditr : _device_data)
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Post-processing %zu entries for device %u...\n",
|
||||
ditr.second.size(), ditr.first);
|
||||
auto _storage = std::vector<local_storage>{};
|
||||
for(auto& itr : ditr.second)
|
||||
{
|
||||
auto _n = itr->feature_names.size();
|
||||
if(_n > _storage.size())
|
||||
{
|
||||
_storage.reserve(_n);
|
||||
for(size_t i = _storage.size(); i < _n; ++i)
|
||||
_storage.emplace_back(
|
||||
ditr.first, i,
|
||||
get_data_labels().at(ditr.first).at(itr->feature_names.at(i)));
|
||||
}
|
||||
}
|
||||
|
||||
auto& _local = _local_data[ditr.first];
|
||||
_local.reserve(ditr.second.size());
|
||||
double _avg = 0.0;
|
||||
for(auto& itr : ditr.second)
|
||||
{
|
||||
if(_local.empty() || itr->entry >= _local.back().parent->exit)
|
||||
{
|
||||
_local.emplace_back(itr);
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t _n = 0;
|
||||
bool _found = false;
|
||||
for(auto litr = _local.rbegin(); litr != _local.rend(); ++litr)
|
||||
{
|
||||
++_n;
|
||||
if((*litr)(itr))
|
||||
{
|
||||
_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!_found) _local.emplace_back(itr);
|
||||
_avg += _n;
|
||||
}
|
||||
}
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(3, "Average # of iterations before match: %.1f\n",
|
||||
_avg / ditr.second.size() * 100.0);
|
||||
|
||||
for(auto& sitr : _storage)
|
||||
{
|
||||
for(auto& itr : _local)
|
||||
sitr(itr, _scope);
|
||||
}
|
||||
|
||||
for(auto& itr : _storage)
|
||||
itr.write();
|
||||
}
|
||||
|
||||
tim::trait::runtime_enabled<rocprofsys::rocprofiler::rocm_data_tracker>::set(false);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
post_process()
|
||||
{
|
||||
if(get_use_perfetto()) post_process_perfetto();
|
||||
|
||||
if(get_use_timemory())
|
||||
{
|
||||
auto _manager = tim::manager::master_instance();
|
||||
if(_manager)
|
||||
{
|
||||
_manager->add_cleanup("rocprofiler", &post_process_timemory);
|
||||
}
|
||||
else
|
||||
{
|
||||
post_process_timemory();
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace rocprofiler
|
||||
} // namespace rocprofsys
|
||||
@@ -1,967 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/roctracer.hpp"
|
||||
#include "binary/analysis.hpp"
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/concepts.hpp"
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/locking.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/thread_info.hpp"
|
||||
#include "library/tracing.hpp"
|
||||
|
||||
#include <timemory/backends/cpu.hpp>
|
||||
#include <timemory/backends/threading.hpp>
|
||||
#include <timemory/hash/types.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <tuple>
|
||||
|
||||
#include <roctracer_ext.h>
|
||||
#include <roctracer_hip.h>
|
||||
#include <roctracer_roctx.h>
|
||||
|
||||
#if ROCPROFSYS_HIP_VERSION < 50300
|
||||
# include <roctracer_hcc.h>
|
||||
#endif
|
||||
|
||||
#define AMD_INTERNAL_BUILD 1
|
||||
#include <roctracer_hsa.h>
|
||||
|
||||
#if __has_include(<hip/amd_detail/hip_prof_str.h>) || (defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0)
|
||||
# include <hip/amd_detail/hip_prof_str.h>
|
||||
# define ROCPROFSYS_HIP_API_ARGS 1
|
||||
#else
|
||||
# define ROCPROFSYS_HIP_API_ARGS 0
|
||||
#endif
|
||||
|
||||
TIMEMORY_DEFINE_API(roctracer)
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace
|
||||
{
|
||||
template <typename Tp>
|
||||
auto&
|
||||
roctracer_type_mutex()
|
||||
{
|
||||
return tim::type_mutex<Tp, category::roctracer, max_supported_threads,
|
||||
locking::atomic_mutex>();
|
||||
}
|
||||
|
||||
std::string
|
||||
hip_api_string(hip_api_id_t id, const hip_api_data_t* data)
|
||||
{
|
||||
#if ROCPROFSYS_HIP_API_ARGS > 0
|
||||
std::string _v = hipApiString(id, data);
|
||||
if(_v.empty()) return _v;
|
||||
auto _pbeg = _v.find('(');
|
||||
if(_pbeg == std::string::npos) return _v;
|
||||
auto _pend = _v.find_last_of(')');
|
||||
if(_pend == std::string::npos || _pbeg >= _pend) return _v;
|
||||
auto _n = (_pend - _pbeg - 1);
|
||||
return _v.substr(_pbeg + 1, _n);
|
||||
#else
|
||||
tim::consume_parameters(id, data);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Per-thread "current device" value; starts out as 1 on every thread and is
// handed out by reference so that callers are able to update it.
int&
get_current_device()
{
    thread_local int _device = 1;
    return _device;
}
|
||||
|
||||
// Process-wide set of 64-bit ids, created empty on first use.
std::unordered_set<uint64_t>&
get_roctracer_kernels()
{
    static std::unordered_set<uint64_t> _kernels{};
    return _kernels;
}
|
||||
|
||||
auto&
|
||||
get_roctracer_hip_data(int64_t _tid = threading::get_id())
|
||||
{
|
||||
using data_t = std::unordered_map<uint64_t, roctracer_hip_bundle_t>;
|
||||
using thread_data_t = thread_data<data_t, category::roctracer>;
|
||||
return thread_data_t::instance(construct_on_thread{ _tid });
|
||||
}
|
||||
|
||||
// Process-wide map from a 64-bit id to a C-string, created empty on first use.
std::unordered_map<uint64_t, const char*>&
get_roctracer_key_data()
{
    static std::unordered_map<uint64_t, const char*> _keys{};
    return _keys;
}
|
||||
|
||||
// Process-wide map from a 64-bit id to a signed 64-bit value, created empty on
// first use.
std::unordered_map<uint64_t, int64_t>&
get_roctracer_tid_data()
{
    static std::unordered_map<uint64_t, int64_t> _tids{};
    return _tids;
}
|
||||
|
||||
auto&
|
||||
get_hip_activity_callbacks(int64_t _tid = threading::get_id())
|
||||
{
|
||||
using thread_data_t =
|
||||
thread_data<std::vector<std::function<void()>>, category::roctracer>;
|
||||
return thread_data_t::instance(construct_on_thread{ _tid });
|
||||
}
|
||||
|
||||
size_t
|
||||
get_hip_activity_callbacks_size()
|
||||
{
|
||||
using thread_data_t =
|
||||
thread_data<std::vector<std::function<void()>>, category::roctracer>;
|
||||
return thread_data_t::size();
|
||||
}
|
||||
|
||||
using hip_activity_mutex_t = std::decay_t<decltype(get_hip_activity_callbacks())>;
|
||||
using key_data_mutex_t = std::decay_t<decltype(get_roctracer_key_data())>;
|
||||
|
||||
auto&
|
||||
get_hip_activity_mutex(int64_t _tid = threading::get_id())
|
||||
{
|
||||
return tim::type_mutex<hip_activity_mutex_t, category::roctracer,
|
||||
max_supported_threads, locking::atomic_mutex>(
|
||||
_tid % max_supported_threads);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
//
|
||||
int64_t
|
||||
get_clock_skew()
|
||||
{
|
||||
static auto _use = tim::get_env("ROCPROFSYS_USE_ROCTRACER_CLOCK_SKEW", true);
|
||||
if(!_use) return 0;
|
||||
static auto _v = []() {
|
||||
auto _gpu_now = []() {
|
||||
uint64_t _ts = 0;
|
||||
roctracer_get_timestamp(&_ts);
|
||||
return _ts;
|
||||
};
|
||||
|
||||
// discard (warm-up)
|
||||
(void) tracing::get_clock_skew(_gpu_now, 1);
|
||||
|
||||
auto _diff = tracing::get_clock_skew(_gpu_now, 10);
|
||||
ROCPROFSYS_BASIC_VERBOSE(1, "CPU/HIP timestamp skew: %li (used: %s)\n", _diff,
|
||||
_use ? "yes" : "no");
|
||||
return _diff;
|
||||
}();
|
||||
return _v;
|
||||
}
|
||||
|
||||
// HSA API callback function
|
||||
void
|
||||
hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg)
|
||||
{
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
(void) arg;
|
||||
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
|
||||
ROCPROFSYS_CONDITIONAL_PRINT_F(
|
||||
get_debug() && get_verbose() >= 2, "<%-30s id(%u)\tcorrelation_id(%lu) %s>\n",
|
||||
roctracer_op_string(domain, cid, 0), cid, data->correlation_id,
|
||||
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
|
||||
|
||||
static thread_local int64_t begin_timestamp = 0;
|
||||
|
||||
switch(cid)
|
||||
{
|
||||
case HSA_API_ID_hsa_init:
|
||||
case HSA_API_ID_hsa_shut_down:
|
||||
case HSA_API_ID_hsa_agent_get_exception_policies:
|
||||
case HSA_API_ID_hsa_agent_get_info:
|
||||
case HSA_API_ID_hsa_amd_agent_iterate_memory_pools:
|
||||
case HSA_API_ID_hsa_amd_agent_memory_pool_get_info:
|
||||
case HSA_API_ID_hsa_amd_coherency_get_type:
|
||||
case HSA_API_ID_hsa_amd_memory_pool_get_info:
|
||||
case HSA_API_ID_hsa_amd_pointer_info:
|
||||
case HSA_API_ID_hsa_amd_pointer_info_set_userdata:
|
||||
case HSA_API_ID_hsa_amd_profiling_async_copy_enable:
|
||||
case HSA_API_ID_hsa_amd_profiling_get_async_copy_time:
|
||||
case HSA_API_ID_hsa_amd_profiling_get_dispatch_time:
|
||||
case HSA_API_ID_hsa_amd_profiling_set_profiler_enabled:
|
||||
case HSA_API_ID_hsa_cache_get_info:
|
||||
case HSA_API_ID_hsa_code_object_get_info:
|
||||
case HSA_API_ID_hsa_code_object_get_symbol:
|
||||
case HSA_API_ID_hsa_code_object_get_symbol_from_name:
|
||||
case HSA_API_ID_hsa_code_object_reader_create_from_memory:
|
||||
case HSA_API_ID_hsa_code_symbol_get_info:
|
||||
case HSA_API_ID_hsa_executable_create_alt:
|
||||
case HSA_API_ID_hsa_executable_freeze:
|
||||
case HSA_API_ID_hsa_executable_get_info:
|
||||
case HSA_API_ID_hsa_executable_get_symbol:
|
||||
case HSA_API_ID_hsa_executable_get_symbol_by_name:
|
||||
case HSA_API_ID_hsa_executable_symbol_get_info:
|
||||
case HSA_API_ID_hsa_extension_get_name:
|
||||
case HSA_API_ID_hsa_ext_image_data_get_info:
|
||||
case HSA_API_ID_hsa_ext_image_data_get_info_with_layout:
|
||||
case HSA_API_ID_hsa_ext_image_get_capability:
|
||||
case HSA_API_ID_hsa_ext_image_get_capability_with_layout:
|
||||
case HSA_API_ID_hsa_isa_get_exception_policies:
|
||||
case HSA_API_ID_hsa_isa_get_info:
|
||||
case HSA_API_ID_hsa_isa_get_info_alt:
|
||||
case HSA_API_ID_hsa_isa_get_round_method:
|
||||
case HSA_API_ID_hsa_region_get_info:
|
||||
case HSA_API_ID_hsa_system_extension_supported:
|
||||
case HSA_API_ID_hsa_system_get_extension_table:
|
||||
case HSA_API_ID_hsa_system_get_info:
|
||||
case HSA_API_ID_hsa_system_get_major_extension_table:
|
||||
case HSA_API_ID_hsa_wavefront_get_info: break;
|
||||
default:
|
||||
{
|
||||
if(data->phase == ACTIVITY_API_PHASE_ENTER)
|
||||
{
|
||||
begin_timestamp = comp::wall_clock::record();
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto* _name = roctracer_op_string(domain, cid, 0);
|
||||
const auto end_timestamp = (cid == HSA_API_ID_hsa_shut_down)
|
||||
? begin_timestamp
|
||||
: comp::wall_clock::record();
|
||||
|
||||
if(begin_timestamp > end_timestamp) return;
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
uint64_t _beg_ts = begin_timestamp;
|
||||
uint64_t _end_ts = end_timestamp;
|
||||
tracing::push_perfetto_ts(category::rocm_hsa{}, _name, _beg_ts,
|
||||
[&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(
|
||||
ctx, "begin_ns", _beg_ts);
|
||||
}
|
||||
});
|
||||
tracing::pop_perfetto_ts(category::rocm_hsa{}, _name, _end_ts,
|
||||
[&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(
|
||||
ctx, "end_ns", _end_ts);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if(get_use_timemory())
|
||||
{
|
||||
auto _beg_ns = begin_timestamp;
|
||||
auto _end_ns = end_timestamp;
|
||||
if(tasking::roctracer::get_task_group().pool())
|
||||
tasking::roctracer::get_task_group().exec(
|
||||
[_name, _beg_ns, _end_ns]() {
|
||||
roctracer_hsa_bundle_t _bundle{ _name };
|
||||
_bundle.start()
|
||||
.store(std::plus<double>{},
|
||||
static_cast<double>(_end_ns - _beg_ns))
|
||||
.stop();
|
||||
});
|
||||
}
|
||||
// timemory is disabled in this callback because collecting data in this
|
||||
// thread causes strange segmentation faults
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
hsa_activity_callback(uint32_t op, const void* vrecord, void* arg)
|
||||
{
|
||||
const auto* record = static_cast<const activity_record_t*>(vrecord);
|
||||
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
auto&& _protect = comp::roctracer::protect_flush_activity();
|
||||
(void) _protect;
|
||||
|
||||
static const char* copy_op_name = "hsa_async_copy";
|
||||
static const char* dispatch_op_name = "hsa_dispatch";
|
||||
static const char* barrier_op_name = "hsa_barrier";
|
||||
const char** _name = nullptr;
|
||||
|
||||
switch(op)
|
||||
{
|
||||
case HSA_OP_ID_DISPATCH: _name = &dispatch_op_name; break;
|
||||
case HSA_OP_ID_COPY: _name = ©_op_name; break;
|
||||
case HSA_OP_ID_BARRIER: _name = &barrier_op_name; break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
ROCPROFSYS_CI_FAIL(_name == nullptr, "Error! HSA operation type not handled: %u\n",
|
||||
op);
|
||||
|
||||
if(!_name) return;
|
||||
|
||||
auto _beg_ns = record->begin_ns + get_clock_skew();
|
||||
auto _end_ns = record->end_ns + get_clock_skew();
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
uint64_t _beg = _beg_ns;
|
||||
uint64_t _end = _end_ns;
|
||||
tracing::push_perfetto_ts(
|
||||
category::device_hsa{}, *_name, _beg, [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "begin_ns", _beg);
|
||||
}
|
||||
});
|
||||
tracing::pop_perfetto_ts(
|
||||
category::device_hsa{}, *_name, _end, [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "end_ns", _end);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
auto _func = [_beg_ns, _end_ns, _name]() {
|
||||
if(get_use_timemory())
|
||||
{
|
||||
roctracer_hsa_bundle_t _bundle{ *_name };
|
||||
_bundle.start()
|
||||
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
|
||||
.stop();
|
||||
}
|
||||
};
|
||||
|
||||
if(tasking::roctracer::get_task_group().pool())
|
||||
tasking::roctracer::get_task_group().exec(_func);
|
||||
|
||||
// timemory is disabled in this callback because collecting data in this thread
|
||||
// causes strange segmentation faults
|
||||
tim::consume_parameters(arg);
|
||||
}
|
||||
|
||||
void
|
||||
hip_exec_activity_callbacks(int64_t _tid)
|
||||
{
|
||||
// guard against initialization of structure when trying to exec
|
||||
if(static_cast<size_t>(_tid) >= get_hip_activity_callbacks_size()) return;
|
||||
|
||||
// ROCPROFSYS_ROCTRACER_CALL(roctracer_flush_activity());
|
||||
locking::atomic_lock _lk{ get_hip_activity_mutex(_tid) };
|
||||
auto& _async_ops = get_hip_activity_callbacks(_tid);
|
||||
if(!_async_ops) return;
|
||||
for(auto& itr : *_async_ops)
|
||||
{
|
||||
if(itr) itr();
|
||||
}
|
||||
_async_ops->clear();
|
||||
}
|
||||
|
||||
namespace
{
// per-thread map from one size_t id to another, initially empty
thread_local auto gpu_crit_cids = std::unordered_map<size_t, size_t>{};
}  // namespace
|
||||
|
||||
void
|
||||
roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data,
|
||||
void* /*arg*/)
|
||||
{
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
if(domain != ACTIVITY_DOMAIN_ROCTX) return;
|
||||
|
||||
static auto _range_map = std::unordered_map<roctx_range_id_t, std::string>{};
|
||||
static auto _range_lock = locking::atomic_mutex{};
|
||||
const auto* _data = reinterpret_cast<const roctx_api_data_t*>(callback_data);
|
||||
static thread_local auto _range_stack = std::vector<std::string>{};
|
||||
|
||||
switch(cid)
|
||||
{
|
||||
case ROCTX_API_ID_roctxRangePushA:
|
||||
{
|
||||
if(_data->args.message)
|
||||
{
|
||||
auto& itr = _range_stack.emplace_back(std::string{ _data->args.message });
|
||||
component::category_region<category::rocm_roctx>::start(itr.c_str());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROCTX_API_ID_roctxRangePop:
|
||||
{
|
||||
if(!_range_stack.empty())
|
||||
{
|
||||
auto& itr = _range_stack.back();
|
||||
component::category_region<category::rocm_roctx>::stop(itr.c_str());
|
||||
_range_stack.pop_back();
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCPROFSYS_THROW("Error! roctxRangePop stack is empty! Expected "
|
||||
"roctxRangePush/roctxRangePop on same thread\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROCTX_API_ID_roctxRangeStartA:
|
||||
{
|
||||
{
|
||||
locking::atomic_lock _lk{ _range_lock, std::defer_lock };
|
||||
if(!_lk.owns_lock()) _lk.lock();
|
||||
_range_map.emplace(roctx_range_id_t{ _data->args.id },
|
||||
std::string{ _data->args.message });
|
||||
}
|
||||
|
||||
component::category_region<category::rocm_roctx>::start(_data->args.message);
|
||||
break;
|
||||
}
|
||||
case ROCTX_API_ID_roctxRangeStop:
|
||||
{
|
||||
std::string_view _message = {};
|
||||
{
|
||||
locking::atomic_lock _lk{ _range_lock, std::defer_lock };
|
||||
if(!_lk.owns_lock()) _lk.lock();
|
||||
auto itr = _range_map.find(roctx_range_id_t{ _data->args.id });
|
||||
ROCPROFSYS_CI_THROW(itr == _range_map.end(),
|
||||
"Error! could not find range with id %lu\n",
|
||||
_data->args.id);
|
||||
if(itr == _range_map.end())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(0, "Warning! could not find range with id %lu\n",
|
||||
_data->args.id);
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
_message = itr->second;
|
||||
}
|
||||
}
|
||||
|
||||
if(!_message.empty())
|
||||
{
|
||||
component::category_region<category::rocm_roctx>::stop(_message.data());
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case ROCTX_API_ID_roctxMarkA:
|
||||
{
|
||||
if(_data->args.message)
|
||||
{
|
||||
component::category_region<category::rocm_roctx>::mark(
|
||||
_data->args.message);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
// HIP API callback function
|
||||
void
|
||||
hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg)
|
||||
{
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
assert(domain == ACTIVITY_DOMAIN_HIP_API);
|
||||
const char* op_name = roctracer_op_string(domain, cid, 0);
|
||||
if(op_name == nullptr) op_name = hip_api_name(cid);
|
||||
if(op_name == nullptr) return;
|
||||
assert(std::string{ op_name } == std::string{ hip_api_name(cid) });
|
||||
|
||||
switch(cid)
|
||||
{
|
||||
case HIP_API_ID___hipPushCallConfiguration:
|
||||
case HIP_API_ID___hipPopCallConfiguration:
|
||||
case HIP_API_ID_hipDeviceEnablePeerAccess:
|
||||
#if ROCPROFSYS_HIP_VERSION_MAJOR > 4 || \
|
||||
(ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 3)
|
||||
case HIP_API_ID_hipImportExternalMemory:
|
||||
case HIP_API_ID_hipDestroyExternalMemory:
|
||||
#endif
|
||||
return;
|
||||
default: break;
|
||||
}
|
||||
|
||||
const hip_api_data_t* data = reinterpret_cast<const hip_api_data_t*>(callback_data);
|
||||
ROCPROFSYS_CONDITIONAL_PRINT_F(
|
||||
get_debug() && get_verbose() >= 2, "<%-30s id(%u)\tcorrelation_id(%lu) %s>\n",
|
||||
op_name, cid, data->correlation_id,
|
||||
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
|
||||
|
||||
int64_t _ts = comp::wall_clock::record();
|
||||
auto _tid = threading::get_id();
|
||||
uint64_t _crit_cid = 0;
|
||||
uint64_t _parent_crit_cid = 0;
|
||||
uint32_t _depth = 0;
|
||||
auto _roct_cid = data->correlation_id;
|
||||
|
||||
auto& _device_id = get_current_device();
|
||||
|
||||
if(data->phase == ACTIVITY_API_PHASE_ENTER)
|
||||
{
|
||||
if(cid == HIP_API_ID_hipSetDevice)
|
||||
get_current_device() =
|
||||
reinterpret_cast<int>(data->args.hipSetDevice.deviceId) + 1;
|
||||
|
||||
const char* _name = nullptr;
|
||||
switch(cid)
|
||||
{
|
||||
case HIP_API_ID_hipLaunchKernel:
|
||||
{
|
||||
_name = hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
|
||||
data->args.hipLaunchKernel.stream);
|
||||
break;
|
||||
}
|
||||
case HIP_API_ID_hipLaunchCooperativeKernel:
|
||||
{
|
||||
_name =
|
||||
hipKernelNameRefByPtr(data->args.hipLaunchCooperativeKernel.f,
|
||||
data->args.hipLaunchCooperativeKernel.stream);
|
||||
if(!_name)
|
||||
{
|
||||
_name =
|
||||
hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
|
||||
data->args.hipLaunchKernel.stream);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case HIP_API_ID_hipHccModuleLaunchKernel:
|
||||
{
|
||||
_name = hipKernelNameRef(data->args.hipHccModuleLaunchKernel.f);
|
||||
break;
|
||||
}
|
||||
case HIP_API_ID_hipModuleLaunchKernel:
|
||||
{
|
||||
_name = hipKernelNameRef(data->args.hipModuleLaunchKernel.f);
|
||||
break;
|
||||
}
|
||||
case HIP_API_ID_hipExtModuleLaunchKernel:
|
||||
{
|
||||
_name = hipKernelNameRef(data->args.hipExtModuleLaunchKernel.f);
|
||||
break;
|
||||
}
|
||||
case HIP_API_ID_hipExtLaunchKernel:
|
||||
{
|
||||
_name =
|
||||
hipKernelNameRefByPtr(data->args.hipExtLaunchKernel.function_address,
|
||||
data->args.hipLaunchKernel.stream);
|
||||
break;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
|
||||
if(_name != nullptr)
|
||||
{
|
||||
if(get_use_perfetto() || get_use_timemory() || get_use_rocm_smi())
|
||||
{
|
||||
locking::atomic_lock _lk{ roctracer_type_mutex<key_data_mutex_t>() };
|
||||
get_roctracer_key_data().emplace(_roct_cid, _name);
|
||||
get_roctracer_tid_data().emplace(_roct_cid, _tid);
|
||||
}
|
||||
}
|
||||
|
||||
std::tie(_crit_cid, _parent_crit_cid, _depth) = create_cpu_cid_entry();
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
static auto _compact_annotations =
|
||||
config::get_setting_value<bool>(
|
||||
"ROCPROFSYS_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS")
|
||||
.value_or(false);
|
||||
|
||||
static auto _enable_backtraces =
|
||||
config::get_setting_value<bool>("ROCPROFSYS_ROCTRACER_HIP_API_BACKTRACE")
|
||||
.value_or(false);
|
||||
|
||||
constexpr size_t bt_stack_depth = 16;
|
||||
constexpr size_t bt_ignore_depth = 3;
|
||||
constexpr bool bt_with_signal_frame = true;
|
||||
|
||||
using backtrace_entry_vec_t = std::vector<tim::unwind::processed_entry>;
|
||||
auto _bt_data = std::optional<backtrace_entry_vec_t>{};
|
||||
if(_enable_backtraces && config::get_perfetto_annotations())
|
||||
{
|
||||
auto _backtrace = tim::get_unw_stack<bt_stack_depth, bt_ignore_depth,
|
||||
bt_with_signal_frame>();
|
||||
_bt_data = backtrace_entry_vec_t{};
|
||||
_bt_data->reserve(_backtrace.size());
|
||||
for(auto itr : _backtrace)
|
||||
{
|
||||
if(itr)
|
||||
{
|
||||
if(auto _val = binary::lookup_ipaddr_entry<false>(itr->address());
|
||||
_val)
|
||||
{
|
||||
_bt_data->emplace_back(std::move(*_val));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto _api_id = static_cast<hip_api_id_t>(cid);
|
||||
tracing::push_perfetto_ts(
|
||||
category::rocm_hip{}, op_name, _ts,
|
||||
::perfetto::Flow::ProcessScoped(_roct_cid),
|
||||
[&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "begin_ns", _ts);
|
||||
tracing::add_perfetto_annotation(ctx, "cid", _crit_cid);
|
||||
tracing::add_perfetto_annotation(ctx, "pcid", _parent_crit_cid);
|
||||
tracing::add_perfetto_annotation(ctx, "device", _device_id);
|
||||
tracing::add_perfetto_annotation(ctx, "tid", _tid);
|
||||
tracing::add_perfetto_annotation(ctx, "depth", _depth);
|
||||
tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid);
|
||||
if(_compact_annotations)
|
||||
{
|
||||
tracing::add_perfetto_annotation(
|
||||
ctx, "args", hip_api_string(_api_id, data));
|
||||
}
|
||||
else
|
||||
{
|
||||
auto _args = std::string{ hip_api_string(_api_id, data) };
|
||||
if(!_args.empty())
|
||||
{
|
||||
for(auto itr : tim::delimit(_args, ","))
|
||||
{
|
||||
if(itr.empty()) continue;
|
||||
auto _bpos = itr.find_first_not_of(' ');
|
||||
auto _epos = itr.find_last_not_of(' ');
|
||||
if(_epos > _bpos)
|
||||
itr = itr.substr(_bpos, (_epos - _bpos) + 1);
|
||||
auto _pos = itr.find('=');
|
||||
if(_pos != std::string::npos)
|
||||
tracing::add_perfetto_annotation(
|
||||
ctx, itr.substr(0, _pos),
|
||||
itr.substr(_pos + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(_enable_backtraces && _bt_data && !_bt_data->empty())
|
||||
{
|
||||
const std::string _unk = "??";
|
||||
size_t _bt_cnt = 0;
|
||||
for(const auto& itr : *_bt_data)
|
||||
{
|
||||
const auto* _func =
|
||||
(itr.name.empty()) ? &_unk : &itr.name;
|
||||
const auto* _loc =
|
||||
(itr.location.empty()) ? &_unk : &itr.location;
|
||||
auto _line = (itr.lineno == 0) ? std::string{ "?" }
|
||||
: join("", itr.lineno);
|
||||
auto _entry = join("", demangle(*_func), " @ ",
|
||||
join(':', *_loc, _line));
|
||||
if(_bt_cnt < 10)
|
||||
{
|
||||
// Prepend zero for better ordering in UI.
|
||||
// Only one zero is ever necessary since stack depth
|
||||
// is limited to 16.
|
||||
tracing::add_perfetto_annotation(
|
||||
ctx, join("", "frame#0", _bt_cnt++), _entry);
|
||||
}
|
||||
else
|
||||
{
|
||||
tracing::add_perfetto_annotation(
|
||||
ctx, join("", "frame#", _bt_cnt++), _entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
if(get_use_timemory())
|
||||
{
|
||||
auto itr = get_roctracer_hip_data()->emplace(
|
||||
_roct_cid, roctracer_hip_bundle_t{ op_name });
|
||||
if(itr.second)
|
||||
{
|
||||
itr.first->second.start();
|
||||
}
|
||||
else if(itr.first != get_roctracer_hip_data()->end())
|
||||
{
|
||||
itr.first->second.stop();
|
||||
get_roctracer_hip_data()->erase(itr.first);
|
||||
}
|
||||
}
|
||||
|
||||
hip_exec_activity_callbacks(_tid);
|
||||
}
|
||||
else if(data->phase == ACTIVITY_API_PHASE_EXIT)
|
||||
{
|
||||
hip_exec_activity_callbacks(_tid);
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
tracing::pop_perfetto_ts(
|
||||
category::rocm_hip{}, op_name, _ts, [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "end_ns", _ts);
|
||||
}
|
||||
});
|
||||
}
|
||||
if(get_use_timemory())
|
||||
{
|
||||
auto _stop = [&_roct_cid](int64_t _tid_v) {
|
||||
auto& _data = get_roctracer_hip_data(_tid_v);
|
||||
auto itr = _data->find(_roct_cid);
|
||||
if(itr != get_roctracer_hip_data()->end())
|
||||
{
|
||||
itr->second.stop();
|
||||
_data->erase(itr);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
if(!_stop(_tid))
|
||||
{
|
||||
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
|
||||
{
|
||||
if(_stop(i)) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tim::consume_parameters(arg);
|
||||
}
|
||||
|
||||
// Activity tracing callback
|
||||
void
|
||||
hip_activity_callback(const char* begin, const char* end, void* arg)
|
||||
{
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
auto&& _protect = comp::roctracer::protect_flush_activity();
|
||||
(void) _protect;
|
||||
|
||||
if(!trait::runtime_enabled<comp::roctracer>::get()) return;
|
||||
static auto _kernel_names = std::unordered_map<const char*, std::string>{};
|
||||
static auto _indexes = std::unordered_map<uint64_t, int>{};
|
||||
static auto _skip_barrier_packets =
|
||||
config::get_setting_value<bool>("ROCPROFSYS_ROCTRACER_DISCARD_BARRIERS")
|
||||
.value_or(false);
|
||||
const roctracer_record_t* record = reinterpret_cast<const roctracer_record_t*>(begin);
|
||||
const roctracer_record_t* end_record =
|
||||
reinterpret_cast<const roctracer_record_t*>(end);
|
||||
|
||||
auto&& _advance_record = [&record]() {
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_next_record(record, &record));
|
||||
};
|
||||
|
||||
while(record < end_record)
|
||||
{
|
||||
// make sure every iteration advances regardless of where return point happens
|
||||
scope::destructor _next_dtor{ _advance_record };
|
||||
|
||||
// ROCPROFSYS_CI will enable these asserts and should fail if something relevant
|
||||
// changes
|
||||
assert(HIP_OP_ID_DISPATCH == 0);
|
||||
assert(HIP_OP_ID_COPY == 1);
|
||||
assert(HIP_OP_ID_BARRIER == 2);
|
||||
|
||||
if(record->domain == ACTIVITY_DOMAIN_HSA_OPS)
|
||||
{
|
||||
hsa_activity_callback(record->op, record, arg);
|
||||
continue;
|
||||
}
|
||||
if(record->domain != ACTIVITY_DOMAIN_HIP_OPS) continue;
|
||||
if(record->op > HIP_OP_ID_BARRIER) continue;
|
||||
if(_skip_barrier_packets && record->op == HIP_OP_ID_BARRIER) continue;
|
||||
|
||||
const char* op_name =
|
||||
roctracer_op_string(record->domain, record->op, record->kind);
|
||||
auto _ns_skew = get_clock_skew();
|
||||
uint64_t _beg_ns = record->begin_ns + _ns_skew;
|
||||
uint64_t _end_ns = record->end_ns + _ns_skew;
|
||||
auto _roct_cid = record->correlation_id;
|
||||
|
||||
auto& _keys = get_roctracer_key_data();
|
||||
auto& _tids = get_roctracer_tid_data();
|
||||
|
||||
int64_t _tid = 0; // thread id
|
||||
int32_t _devid = record->device_id; // device id
|
||||
int64_t _queid = record->queue_id; // queue id
|
||||
uintptr_t _queue = 0; // Host queue (stream)
|
||||
const char* _name = nullptr;
|
||||
bool _found = false;
|
||||
|
||||
{
|
||||
locking::atomic_lock _lk{ roctracer_type_mutex<key_data_mutex_t>() };
|
||||
if(_tids.find(_roct_cid) != _tids.end())
|
||||
{
|
||||
_found = true;
|
||||
_tid = _tids.at(_roct_cid);
|
||||
auto itr = _keys.find(_roct_cid);
|
||||
if(itr != _keys.end()) _name = itr->second;
|
||||
}
|
||||
}
|
||||
|
||||
if(_name == nullptr && op_name == nullptr) continue;
|
||||
if(_name == nullptr) _name = op_name;
|
||||
|
||||
static auto _op_id_names =
|
||||
std::array<const char*, 3>{ "DISPATCH", "COPY", "BARRIER" };
|
||||
|
||||
if(_end_ns < _beg_ns)
|
||||
{
|
||||
auto _verbose = []() { return get_verbose() >= 0 || get_debug(); };
|
||||
static size_t _n = 0;
|
||||
static size_t _nmax =
|
||||
get_env<size_t>("ROCPROFSYS_ROCTRACER_DISCARD_INVALID", 0);
|
||||
if(_nmax == 0) std::swap(_end_ns, _beg_ns);
|
||||
ROCPROFSYS_WARNING_IF_F(
|
||||
_n < _nmax && _verbose(),
|
||||
"%4zu :: Discarding kernel roctracer activity record which ended before "
|
||||
"it started :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) "
|
||||
"delta=%li, device=%d, queue=%lu, pid=%u, tid=%lu, op=%s\n",
|
||||
_n, op_name, _name, record->correlation_id, _beg_ns, _end_ns,
|
||||
(static_cast<int64_t>(_end_ns) - static_cast<int64_t>(_beg_ns)), _devid,
|
||||
_queid, record->process_id, _tid, _op_id_names.at(record->op));
|
||||
ROCPROFSYS_WARNING_IF_F(
|
||||
_nmax > 0 && _n == _nmax && _verbose(),
|
||||
"Suppressing future messages about discarding kernel roctracer activity "
|
||||
"record which ended before it started. Set "
|
||||
"ROCPROFSYS_ROCTRACER_DISCARD_INVALID=N to increase/decrease the number "
|
||||
"of messages. If N is set to 0, data will be included after swapping the "
|
||||
"begin and end values\n");
|
||||
if(_end_ns < _beg_ns)
|
||||
{
|
||||
++_n;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// execute this on this thread bc of how perfetto visualization works
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
if(_kernel_names.find(_name) == _kernel_names.end())
|
||||
_kernel_names.emplace(_name, tim::demangle(_name));
|
||||
|
||||
auto _track_desc = [](int32_t _device_id, int64_t _queue_id) {
|
||||
if(config::get_perfetto_roctracer_per_stream())
|
||||
return JOIN("", "HIP Activity Device ", _device_id, ", Queue ",
|
||||
_queue_id);
|
||||
return JOIN("", "HIP Activity Device ", _device_id);
|
||||
};
|
||||
|
||||
const auto _track = tracing::get_perfetto_track(
|
||||
category::device_hip{}, _track_desc, _devid,
|
||||
(get_perfetto_roctracer_per_stream()) ? _queid : 0);
|
||||
|
||||
assert(_end_ns >= _beg_ns);
|
||||
tracing::push_perfetto_track(
|
||||
category::device_hip{}, _kernel_names.at(_name).c_str(), _track, _beg_ns,
|
||||
::perfetto::Flow::ProcessScoped(_roct_cid),
|
||||
[&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns);
|
||||
tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns);
|
||||
tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid);
|
||||
tracing::add_perfetto_annotation(ctx, "device", _devid);
|
||||
tracing::add_perfetto_annotation(ctx, "queue", _queid);
|
||||
tracing::add_perfetto_annotation(ctx, "tid", _tid);
|
||||
tracing::add_perfetto_annotation(
|
||||
ctx, "stream", JOIN("", "0x", std::hex, _queue));
|
||||
tracing::add_perfetto_annotation(ctx, "op",
|
||||
_op_id_names.at(record->op));
|
||||
}
|
||||
});
|
||||
tracing::pop_perfetto_track(category::device_hip{}, "", _track, _end_ns);
|
||||
}
|
||||
|
||||
if(_found && _name != nullptr && get_use_timemory())
|
||||
{
|
||||
auto _func = [_beg_ns, _end_ns, _name]() {
|
||||
roctracer_hip_bundle_t _bundle{ _name };
|
||||
_bundle.start()
|
||||
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
|
||||
.stop()
|
||||
.get<comp::wall_clock>([&](comp::wall_clock* wc) {
|
||||
wc->set_value(_end_ns - _beg_ns);
|
||||
wc->set_accum(_end_ns - _beg_ns);
|
||||
return wc;
|
||||
});
|
||||
_bundle.pop();
|
||||
};
|
||||
|
||||
auto& _async_ops = get_hip_activity_callbacks(_tid);
|
||||
locking::atomic_lock _lk{ get_hip_activity_mutex(_tid) };
|
||||
_async_ops->emplace_back(std::move(_func));
|
||||
}
|
||||
}
|
||||
|
||||
// ensures that all the updates are written
|
||||
if(get_use_perfetto()) ::perfetto::TrackEvent::Flush();
|
||||
}
|
||||
|
||||
bool&
|
||||
roctracer_is_init()
|
||||
{
|
||||
static bool _v = tim::get_env("ROCPROFSYS_ROCTRACER_IS_INIT", false);
|
||||
return _v;
|
||||
}
|
||||
|
||||
bool&
|
||||
roctracer_is_setup()
|
||||
{
|
||||
static bool _v = false;
|
||||
return _v;
|
||||
}
|
||||
|
||||
using roctracer_functions_t = std::vector<std::pair<std::string, std::function<void()>>>;
|
||||
|
||||
roctracer_functions_t&
|
||||
roctracer_setup_routines()
|
||||
{
|
||||
static auto _v = roctracer_functions_t{};
|
||||
return _v;
|
||||
}
|
||||
|
||||
roctracer_functions_t&
|
||||
roctracer_shutdown_routines()
|
||||
{
|
||||
static auto _v = roctracer_functions_t{};
|
||||
return _v;
|
||||
}
|
||||
} // namespace rocprofsys
|
||||
@@ -1,89 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/hip_runtime.hpp"
|
||||
#include "core/perfetto.hpp"
|
||||
#include "library/components/roctracer.hpp"
|
||||
#include "library/ptl.hpp"
|
||||
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
// Macro to check ROC-tracer calls status
|
||||
#define ROCPROFSYS_ROCTRACER_CALL(call) \
|
||||
{ \
|
||||
ROCPROFSYS_DEBUG_F(#call); \
|
||||
int err = call; \
|
||||
if(err != 0) \
|
||||
{ \
|
||||
ROCPROFSYS_PRINT_F("%s in: %s\n", roctracer_error_string(), #call); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
using roctracer_hip_bundle_t =
|
||||
tim::component_bundle<category::rocm_hip, comp::roctracer_data, comp::wall_clock>;
|
||||
using roctracer_hsa_bundle_t =
|
||||
tim::component_bundle<category::rocm_hsa, comp::roctracer_data>;
|
||||
using roctracer_functions_t = std::vector<std::pair<std::string, std::function<void()>>>;
|
||||
|
||||
// HSA API callback function
|
||||
void
|
||||
hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg);
|
||||
|
||||
void
|
||||
hsa_activity_callback(uint32_t op, const void* record, void* arg);
|
||||
|
||||
void
|
||||
hip_exec_activity_callbacks(int64_t _tid);
|
||||
|
||||
// HIP API callback function
|
||||
void
|
||||
hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg);
|
||||
|
||||
void
|
||||
roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg);
|
||||
|
||||
// Activity tracing callback
|
||||
void
|
||||
hip_activity_callback(const char* begin, const char* end, void*);
|
||||
|
||||
bool&
|
||||
roctracer_is_init();
|
||||
|
||||
bool&
|
||||
roctracer_is_setup();
|
||||
|
||||
int64_t
|
||||
get_clock_skew();
|
||||
|
||||
roctracer_functions_t&
|
||||
roctracer_setup_routines();
|
||||
|
||||
roctracer_functions_t&
|
||||
roctracer_shutdown_routines();
|
||||
} // namespace rocprofsys
|
||||
@@ -33,7 +33,6 @@
|
||||
#include "library/components/mpi_gotcha.hpp"
|
||||
#include "library/components/numa_gotcha.hpp"
|
||||
#include "library/components/pthread_gotcha.hpp"
|
||||
#include "library/components/roctracer.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
|
||||
#include <timemory/backends/threading.hpp>
|
||||
|
||||
@@ -4,9 +4,7 @@
|
||||
#
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
set(ROCPROFSYS_ROCM_EVENTS_TEST
|
||||
"GRBM_COUNT,GPUBusy,SQ_WAVES,SQ_INSTS_VALU,VALUInsts,TCC_HIT_sum,TA_TA_BUSY[0]:device=0,TA_TA_BUSY[11]:device=0"
|
||||
)
|
||||
set(ROCPROFSYS_ROCM_EVENTS_TEST "GRBM_COUNT,SQ_WAVES,SQ_INSTS_VALU,TA_TA_BUSY:device=0")
|
||||
|
||||
rocprofiler_systems_add_test(
|
||||
NAME transpose
|
||||
@@ -26,7 +24,8 @@ rocprofiler_systems_add_test(
|
||||
args
|
||||
-E
|
||||
uniform_int_distribution
|
||||
ENVIRONMENT "${_base_environment}")
|
||||
ENVIRONMENT "${_base_environment}"
|
||||
RUNTIME_TIMEOUT 480)
|
||||
|
||||
rocprofiler_systems_add_test(
|
||||
SKIP_REWRITE SKIP_RUNTIME
|
||||
@@ -36,9 +35,7 @@ rocprofiler_systems_add_test(
|
||||
GPU ON
|
||||
NUM_PROCS 1
|
||||
RUN_ARGS 1 2 2
|
||||
ENVIRONMENT
|
||||
"${_base_environment};ROCPROFSYS_ROCTRACER_HSA_ACTIVITY=OFF;ROCPROFSYS_ROCTRACER_HSA_API=OFF"
|
||||
)
|
||||
ENVIRONMENT "${_base_environment}")
|
||||
|
||||
rocprofiler_systems_add_test(
|
||||
SKIP_BASELINE SKIP_RUNTIME
|
||||
@@ -64,7 +61,11 @@ rocprofiler_systems_add_test(
|
||||
ENVIRONMENT "${_base_environment}"
|
||||
REWRITE_FAIL_REGEX "0 instrumented loops in procedure transpose")
|
||||
|
||||
if(ROCPROFSYS_USE_ROCPROFILER)
|
||||
if(ROCPROFSYS_USE_ROCM)
|
||||
set(_ROCP_PASS_REGEX
|
||||
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-TA_TA_BUSY.txt(.*)"
|
||||
)
|
||||
|
||||
rocprofiler_systems_add_test(
|
||||
SKIP_BASELINE SKIP_RUNTIME
|
||||
NAME transpose-rocprofiler
|
||||
@@ -76,22 +77,7 @@ if(ROCPROFSYS_USE_ROCPROFILER)
|
||||
REWRITE_ARGS -e -v 2 -E uniform_int_distribution
|
||||
ENVIRONMENT
|
||||
"${_base_environment};ROCPROFSYS_ROCM_EVENTS=${ROCPROFSYS_ROCM_EVENTS_TEST}"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt"
|
||||
)
|
||||
REWRITE_RUN_PASS_REGEX "${_ROCP_PASS_REGEX}"
|
||||
SAMPLING_PASS_REGEX "${_ROCP_PASS_REGEX}")
|
||||
|
||||
rocprofiler_systems_add_test(
|
||||
SKIP_BASELINE SKIP_RUNTIME
|
||||
NAME transpose-rocprofiler-no-roctracer
|
||||
TARGET transpose
|
||||
LABELS "rocprofiler"
|
||||
MPI ${TRANSPOSE_USE_MPI}
|
||||
GPU ON
|
||||
NUM_PROCS ${NUM_PROCS}
|
||||
REWRITE_ARGS -e -v 2 -E uniform_int_distribution
|
||||
ENVIRONMENT
|
||||
"${_base_environment};ROCPROFSYS_USE_ROCTRACER=OFF;ROCPROFSYS_ROCM_EVENTS=${ROCPROFSYS_ROCM_EVENTS_TEST}"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt"
|
||||
REWRITE_RUN_FAIL_REGEX "roctracer.txt|ROCPROFSYS_ABORT_FAIL_REGEX")
|
||||
endif()
|
||||
|
||||
@@ -226,7 +226,7 @@ endif()
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
set(_VALID_GPU OFF)
|
||||
if(ROCPROFSYS_USE_HIP AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU))
|
||||
if(ROCPROFSYS_USE_ROCM AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU))
|
||||
set(_VALID_GPU ON)
|
||||
find_program(
|
||||
ROCPROFSYS_ROCM_SMI_EXE
|
||||
@@ -254,7 +254,7 @@ if(ROCPROFSYS_USE_HIP AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU))
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(LULESH_USE_GPU ${LULESH_USE_HIP})
|
||||
set(LULESH_USE_GPU ${LULESH_USE_ROCM})
|
||||
if(LULESH_USE_CUDA)
|
||||
set(LULESH_USE_GPU ON)
|
||||
endif()
|
||||
@@ -314,8 +314,6 @@ ROCPROFSYS_SAMPLING_FREQ = 300
|
||||
ROCPROFSYS_SAMPLING_DELAY = 0.05
|
||||
ROCPROFSYS_SAMPLING_CPUS = 0-${NUM_SAMPLING_PROCS}
|
||||
ROCPROFSYS_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES
|
||||
ROCPROFSYS_ROCTRACER_HSA_API = ON
|
||||
ROCPROFSYS_ROCTRACER_HSA_ACTIVITY = ON
|
||||
|
||||
# test-specific values
|
||||
${_FILE_CONTENTS}
|
||||
@@ -430,18 +428,18 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
if(TEST_GPU)
|
||||
list(APPEND TEST_LABELS "gpu")
|
||||
|
||||
if(NOT "ROCPROFSYS_USE_ROCTRACER=OFF" IN_LIST TEST_ENVIRONMENT)
|
||||
list(APPEND TEST_LABELS "roctracer")
|
||||
if(NOT "ROCPROFSYS_USE_ROCM=OFF" IN_LIST TEST_ENVIRONMENT)
|
||||
list(APPEND TEST_LABELS "rocm")
|
||||
endif()
|
||||
|
||||
if(NOT "ROCPROFSYS_USE_ROCM_SMI=OFF" IN_LIST TEST_ENVIRONMENT)
|
||||
if(NOT "ROCPROFSYS_USE_ROCM=OFF" IN_LIST TEST_ENVIRONMENT)
|
||||
list(APPEND TEST_LABELS "rocm-smi")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if("ROCPROFSYS_USE_ROCTRACER=ON" IN_LIST TEST_ENVIRONMENT AND NOT "roctracer" IN_LIST
|
||||
TEST_ENVIRONMENT)
|
||||
list(APPEND TEST_LABELS "roctracer")
|
||||
if("ROCPROFSYS_USE_ROCM=ON" IN_LIST TEST_ENVIRONMENT AND NOT "rocm" IN_LIST
|
||||
TEST_ENVIRONMENT)
|
||||
list(APPEND TEST_LABELS "rocm")
|
||||
endif()
|
||||
|
||||
if("ROCPROFSYS_USE_ROCM_SMI=ON" IN_LIST TEST_ENVIRONMENT AND NOT "rocm-smi" IN_LIST
|
||||
@@ -449,11 +447,6 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
list(APPEND TEST_LABELS "rocm-smi")
|
||||
endif()
|
||||
|
||||
if("ROCPROFSYS_USE_ROCPROFILER=ON" IN_LIST TEST_ENVIRONMENT
|
||||
AND NOT "rocprofiler" IN_LIST TEST_ENVIRONMENT)
|
||||
list(APPEND TEST_LABELS "rocprofiler")
|
||||
endif()
|
||||
|
||||
if(TARGET ${TEST_TARGET})
|
||||
if(DEFINED TEST_MPI
|
||||
AND ${TEST_MPI}
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする