Update to use rocprofiler-sdk (#55)

- Renames the CMake option "ROCPROFSYS_USE_HIP" to "ROCPROFSYS_USE_ROCM"
- Remove the "ROCPROFSYS_USE_ROCM_SMI" option. It is now controlled by the "ROCPROFSYS_USE_ROCM" option instead.
   - Runtime configuration can still toggle ROCPROFSYS_USE_ROCM_SMI to disable the sampling.
- Rename ROCPROFSYS_HIP_VERSION macro to ROCPROFSYS_ROCM_VERSION and remove blocks for `ROCPROFSYS_ROCM_VERSION < 60000`
- Remove ROCPROFSYS_USE_ROCTRACER and ROCPROFSYS_USE_ROCPROFILER
- Update test cases
- Update docker files and workflows to install cmake 3.21, which is required by the rocprofiler-sdk find_package script.
- Remove rocm-6.2 from workflows due to a rocprofiler-sdk API change.

[ROCm/rocprofiler-systems commit: 88aa2d3cbe]
このコミットが含まれているのは:
David Galiffi
2024-12-13 18:48:39 -05:00
committed by GitHub
コミット b29cfac106
87個のファイルの変更3842行の追加6261行の削除
+1 -39
ファイルの表示
@@ -39,12 +39,10 @@ jobs:
version: "15.5"
- distro: "opensuse"
version: "15.6"
- distro: "rhel"
version: "8.8"
- distro: "rhel"
version: "8.10"
- distro: "rhel"
version: "9.2"
version: "9.3"
- distro: "rhel"
version: "9.4"
@@ -90,9 +88,6 @@ jobs:
- os-distro: "ubuntu"
os-version: "20.04"
rocm-version: "0.0"
- os-distro: "ubuntu"
os-version: "20.04"
rocm-version: "6.2"
- os-distro: "ubuntu"
os-version: "20.04"
rocm-version: "6.3"
@@ -100,9 +95,6 @@ jobs:
- os-distro: "ubuntu"
os-version: "22.04"
rocm-version: "0.0"
- os-distro: "ubuntu"
os-version: "22.04"
rocm-version: "6.2"
- os-distro: "ubuntu"
os-version: "22.04"
rocm-version: "6.3"
@@ -110,9 +102,6 @@ jobs:
- os-distro: "ubuntu"
os-version: "24.04"
rocm-version: "0.0"
- os-distro: "ubuntu"
os-version: "24.04"
rocm-version: "6.2"
- os-distro: "ubuntu"
os-version: "24.04"
rocm-version: "6.3"
@@ -120,9 +109,6 @@ jobs:
- os-distro: "opensuse"
os-version: "15.5"
rocm-version: "0.0"
- os-distro: "opensuse"
os-version: "15.5"
rocm-version: "6.2"
- os-distro: "opensuse"
os-version: "15.5"
rocm-version: "6.3"
@@ -130,43 +116,19 @@ jobs:
- os-distro: "opensuse"
os-version: "15.6"
rocm-version: "0.0"
- os-distro: "opensuse"
os-version: "15.6"
rocm-version: "6.2"
- os-distro: "opensuse"
os-version: "15.6"
rocm-version: "6.3"
# RHEL 8.9
- os-distro: "rhel"
os-version: "8.9"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "8.9"
rocm-version: "6.2"
# RHEL 8.10
- os-distro: "rhel"
os-version: "8.10"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "8.10"
rocm-version: "6.2"
- os-distro: "rhel"
os-version: "8.10"
rocm-version: "6.3"
# RHEL 9.3
- os-distro: "rhel"
os-version: "9.3"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "9.3"
rocm-version: "6.2"
# RHEL 9.4
- os-distro: "rhel"
os-version: "9.4"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "9.4"
rocm-version: "6.2"
- os-distro: "rhel"
os-version: "9.4"
rocm-version: "6.3"
-35
ファイルの表示
@@ -37,9 +37,6 @@ jobs:
- os-distro: "ubuntu"
os-version: "20.04"
rocm-version: "0.0"
- os-distro: "ubuntu"
os-version: "20.04"
rocm-version: "6.2"
- os-distro: "ubuntu"
os-version: "20.04"
rocm-version: "6.3"
@@ -47,9 +44,6 @@ jobs:
- os-distro: "ubuntu"
os-version: "22.04"
rocm-version: "0.0"
- os-distro: "ubuntu"
os-version: "22.04"
rocm-version: "6.2"
- os-distro: "ubuntu"
os-version: "22.04"
rocm-version: "6.3"
@@ -57,9 +51,6 @@ jobs:
- os-distro: "ubuntu"
os-version: "24.04"
rocm-version: "0.0"
- os-distro: "ubuntu"
os-version: "24.04"
rocm-version: "6.2"
- os-distro: "ubuntu"
os-version: "24.04"
rocm-version: "6.3"
@@ -67,9 +58,6 @@ jobs:
- os-distro: "opensuse"
os-version: "15.5"
rocm-version: "0.0"
- os-distro: "opensuse"
os-version: "15.5"
rocm-version: "6.2"
- os-distro: "opensuse"
os-version: "15.5"
rocm-version: "6.3"
@@ -77,43 +65,20 @@ jobs:
- os-distro: "opensuse"
os-version: "15.6"
rocm-version: "0.0"
- os-distro: "opensuse"
os-version: "15.6"
rocm-version: "6.2"
- os-distro: "opensuse"
os-version: "15.6"
rocm-version: "6.3"
# RHEL 8.9
- os-distro: "rhel"
os-version: "8.9"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "8.9"
rocm-version: "6.2"
# RHEL 8.10
- os-distro: "rhel"
os-version: "8.10"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "8.10"
rocm-version: "6.2"
- os-distro: "rhel"
os-version: "8.10"
rocm-version: "6.3"
# RHEL 9.3
- os-distro: "rhel"
os-version: "9.3"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "9.3"
rocm-version: "6.2"
# RHEL 9.4
- os-distro: "rhel"
os-version: "9.4"
rocm-version: "0.0"
- os-distro: "rhel"
os-version: "9.4"
rocm-version: "6.2"
- os-distro: "rhel"
os-version: "9.4"
rocm-version: "6.3"
+2 -2
ファイルの表示
@@ -66,7 +66,7 @@ jobs:
fi
python3 -m pip install --upgrade pip &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
python3 -m pip install 'cmake==3.21' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
- name: Configure Env
@@ -93,7 +93,7 @@ jobs:
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems
-DROCPROFSYS_BUILD_TESTING=ON
-DROCPROFSYS_USE_MPI=OFF
-DROCPROFSYS_USE_HIP=OFF
-DROCPROFSYS_USE_ROCM=OFF
-DROCPROFSYS_USE_OMPT=OFF
-DROCPROFSYS_USE_PYTHON=ON
-DROCPROFSYS_INSTALL_PERFETTO_TOOLS=OFF
+4 -4
ファイルの表示
@@ -46,8 +46,8 @@ jobs:
fail-fast: false
matrix:
compiler: ['g++']
os-release: [ '8.10', '9.2', '9.4' ]
rocm-version: [ '0.0', '6.2', '6.3' ]
os-release: [ '8.10', '9.3', '9.4' ]
rocm-version: [ '0.0', '6.3' ]
build-type: ['Release']
steps:
@@ -70,7 +70,7 @@ jobs:
fi
python3 -m pip install --upgrade pip &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
python3 -m pip install 'cmake==3.21' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
- name: Install ROCm Packages
@@ -108,7 +108,7 @@ jobs:
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems
-DROCPROFSYS_BUILD_TESTING=ON
-DROCPROFSYS_USE_MPI=OFF
-DROCPROFSYS_USE_HIP=${USE_HIP}
-DROCPROFSYS_USE_ROCM=${USE_HIP}
-DROCPROFSYS_USE_OMPT=OFF
-DROCPROFSYS_USE_PYTHON=ON
-DROCPROFSYS_USE_MPI_HEADERS=ON
+10 -16
ファイルの表示
@@ -100,7 +100,7 @@ jobs:
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
python3 -m pip install 'cmake==3.21' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
apt-get -y --purge autoremove &&
apt-get -y clean &&
@@ -145,7 +145,7 @@ jobs:
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems
-DROCPROFSYS_BUILD_TESTING=ON
-DROCPROFSYS_USE_MPI=OFF
-DROCPROFSYS_USE_HIP=OFF
-DROCPROFSYS_USE_ROCM=OFF
-DROCPROFSYS_USE_OMPT=OFF
-DROCPROFSYS_USE_PAPI=OFF
-DROCPROFSYS_USE_PYTHON=${{ matrix.python }}
@@ -245,16 +245,10 @@ jobs:
fail-fast: false
matrix:
compiler: ['g++']
rocm-version: ['6.2']
rocm-version: ['6.3']
mpi-headers: ['OFF']
build-jobs: ['3']
ctest-exclude: ['-LE "mpi-example|transpose"']
include:
- compiler: 'g++'
rocm-version: 'latest'
mpi-headers: 'ON'
build-jobs: '2'
ctest-exclude: '-LE transpose'
ctest-exclude: ['-LE "transpose"']
env:
BUILD_TYPE: MinSizeRel
@@ -282,7 +276,7 @@ jobs:
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
python3 -m pip install 'cmake==3.21' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
apt-get -y --purge autoremove &&
apt-get -y clean &&
@@ -336,7 +330,7 @@ jobs:
-DROCPROFSYS_BUILD_EXTRA_OPTIMIZATIONS=OFF
-DROCPROFSYS_BUILD_LTO=OFF
-DROCPROFSYS_USE_MPI=OFF
-DROCPROFSYS_USE_HIP=ON
-DROCPROFSYS_USE_ROCM=ON
-DROCPROFSYS_MAX_THREADS=64
-DROCPROFSYS_USE_PAPI=OFF
-DROCPROFSYS_USE_OMPT=OFF
@@ -440,7 +434,7 @@ jobs:
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
python3 -m pip install 'cmake==3.21' &&
sudo apt-get -y --purge autoremove &&
sudo apt-get -y clean
@@ -477,7 +471,7 @@ jobs:
-DROCPROFSYS_BUILD_TESTING=ON
-DROCPROFSYS_BUILD_DYNINST=ON
-DROCPROFSYS_USE_MPI=${USE_MPI}
-DROCPROFSYS_USE_HIP=OFF
-DROCPROFSYS_USE_ROCM=OFF
-DROCPROFSYS_USE_PYTHON=${{ matrix.python }}
-DROCPROFSYS_USE_OMPT=${{ matrix.ompt }}
-DROCPROFSYS_USE_PAPI=${{ matrix.papi }}
@@ -593,7 +587,7 @@ jobs:
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
python3 -m pip install 'cmake==3.21' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
apt-get -y --purge autoremove &&
apt-get -y clean &&
@@ -625,7 +619,7 @@ jobs:
-DROCPROFSYS_USE_PYTHON=ON
-DROCPROFSYS_USE_OMPT=ON
-DROCPROFSYS_USE_PAPI=ON
-DROCPROFSYS_USE_HIP=OFF
-DROCPROFSYS_USE_ROCM=OFF
-DROCPROFSYS_USE_RCCL=OFF
-DROCPROFSYS_MAX_THREADS=64
-DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl"
+3 -18
ファイルの表示
@@ -75,22 +75,7 @@ jobs:
static-libgcc: 'OFF'
static-libstdcxx: 'OFF'
build-dyninst: 'OFF'
rocm-version: '6.2'
- compiler: 'g++'
hip: 'ON'
mpi: 'OFF'
ompt: 'OFF'
papi: 'OFF'
python: 'ON'
lto: 'OFF'
strip: 'OFF'
hidden: 'ON'
build-type: 'Release'
mpi-headers: 'OFF'
static-libgcc: 'OFF'
static-libstdcxx: 'OFF'
build-dyninst: 'OFF'
rocm-version: 'latest'
rocm-version: '6.3'
env:
OMPI_ALLOW_RUN_AS_ROOT: 1
@@ -116,7 +101,7 @@ jobs:
openmpi-bin python3-pip texinfo ${{ matrix.compiler }} &&
python3 -m pip install --upgrade pip &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
python3 -m pip install 'cmake==3.21' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
- name: Install ROCm Packages
@@ -183,7 +168,7 @@ jobs:
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems-dev
-DROCPROFSYS_BUILD_TESTING=ON
-DROCPROFSYS_USE_MPI=${{ matrix.mpi }}
-DROCPROFSYS_USE_HIP=${{ matrix.hip }}
-DROCPROFSYS_USE_ROCM=${{ matrix.hip }}
-DROCPROFSYS_USE_OMPT=${{ matrix.ompt }}
-DROCPROFSYS_USE_PAPI=${{ matrix.papi }}
-DROCPROFSYS_USE_PYTHON=${{ matrix.python }}
+1 -1
ファイルの表示
@@ -101,7 +101,7 @@ jobs:
-DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems \
-DROCPROFSYS_BUILD_TESTING=ON \
-DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl" \
-DROCPROFSYS_USE_HIP=${USE_ROCM} \
-DROCPROFSYS_USE_ROCM=${USE_ROCM} \
-DROCPROFSYS_USE_PYTHON=ON \
-DROCPROFSYS_STRIP_LIBRARIES=${{ matrix.strip }} \
-DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs \
+4 -39
ファイルの表示
@@ -176,18 +176,11 @@ rocprofiler_systems_add_option(ROCPROFSYS_USE_CLANG_TIDY "Enable clang-tidy" OFF
rocprofiler_systems_add_option(ROCPROFSYS_USE_BFD
"Enable BFD support (map call-stack samples to LOC)" ON)
rocprofiler_systems_add_option(ROCPROFSYS_USE_MPI "Enable MPI support" OFF)
rocprofiler_systems_add_option(ROCPROFSYS_USE_HIP "Enable HIP support" ON)
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCM "Enable ROCm support" ON)
rocprofiler_systems_add_option(ROCPROFSYS_USE_PAPI "Enable HW counter support via PAPI"
ON)
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCTRACER "Enable roctracer support"
${ROCPROFSYS_USE_HIP})
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCPROFILER "Enable rocprofiler support"
${ROCPROFSYS_USE_HIP})
rocprofiler_systems_add_option(
ROCPROFSYS_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling"
${ROCPROFSYS_USE_HIP})
rocprofiler_systems_add_option(ROCPROFSYS_USE_RCCL "Enable RCCL support"
${ROCPROFSYS_USE_HIP})
${ROCPROFSYS_USE_ROCM})
rocprofiler_systems_add_option(
ROCPROFSYS_USE_MPI_HEADERS
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
@@ -217,30 +210,10 @@ elseif("$ENV{ROCPROFSYS_CI}")
endif()
endif()
if(NOT ROCPROFSYS_USE_HIP)
set(ROCPROFSYS_USE_ROCTRACER
OFF
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
set(ROCPROFSYS_USE_ROCPROFILER
OFF
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
set(ROCPROFSYS_USE_ROCM_SMI
OFF
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
if(NOT ROCPROFSYS_USE_ROCM)
set(ROCPROFSYS_USE_RCCL
OFF
CACHE BOOL "Disabled via ROCPROFSYS_USE_HIP=OFF" FORCE)
elseif(
ROCPROFSYS_USE_HIP
AND NOT ROCPROFSYS_USE_ROCTRACER
AND NOT ROCPROFSYS_USE_ROCPROFILER
AND NOT ROCPROFSYS_USE_ROCM_SMI
AND NOT ROCPROFSYS_USE_RCCL)
rocprofiler_systems_message(
AUTHOR_WARNING
"Setting ROCPROFSYS_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled"
)
set(ROCPROFSYS_USE_HIP OFF)
CACHE BOOL "Disabled via ROCPROFSYS_USE_ROCM=OFF" FORCE)
endif()
if(ROCPROFSYS_BUILD_TESTING)
@@ -378,14 +351,6 @@ endif()
#
# ------------------------------------------------------------------------------#
if(NOT ROCPROFSYS_USE_ROCTRACER AND NOT ROCPROFSYS_USE_ROCPROFILER)
set(ROCPROFSYS_HSA_ENV "# ")
endif()
if(NOT ROCPROFSYS_USE_ROCPROFILER)
set(ROCPROFSYS_ROCP_ENV "# ")
endif()
configure_file(
${PROJECT_SOURCE_DIR}/LICENSE
${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/doc/${PROJECT_NAME}/LICENSE
+3 -12
ファイルの表示
@@ -54,9 +54,7 @@ set(ROCPROFSYS_CPACK_SYSTEM_NAME
CACHE STRING "System name, e.g. Linux or Ubuntu-20.04")
set(ROCPROFSYS_CPACK_PACKAGE_SUFFIX "")
if(ROCPROFSYS_USE_HIP
OR ROCPROFSYS_USE_ROCTRACER
OR ROCPROFSYS_USE_ROCM_SMI)
if(ROCPROFSYS_USE_ROCM)
set(ROCPROFSYS_CPACK_PACKAGE_SUFFIX
"${ROCPROFSYS_CPACK_PACKAGE_SUFFIX}-ROCm-${ROCmVersion_NUMERIC_VERSION}")
endif()
@@ -159,19 +157,12 @@ if(NOT ROCPROFSYS_BUILD_DYNINST)
endif()
endif()
if(ROCmVersion_FOUND)
set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCM_SMI_SUFFIX
" (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})")
endif()
if(ROCPROFSYS_USE_ROCM_SMI)
if(ROCPROFSYS_USE_ROCM)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocm-smi-lib${_ROCM_SMI_SUFFIX}")
endif()
if(ROCPROFSYS_USE_ROCTRACER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}")
endif()
if(ROCPROFSYS_USE_ROCPROFILER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}")
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-sdk (>= ${rocprofiler-sdk_VERSION})")
endif()
if(ROCPROFSYS_USE_MPI)
if("${ROCPROFSYS_MPI_IMPL}" STREQUAL "openmpi")
+16 -16
ファイルの表示
@@ -109,13 +109,6 @@ set(_ROCPROFSYS_PAPI_COMPONENTS
)
if(ROCPROFSYS_PAPI_AUTO_COMPONENTS)
# rocm
if(ROCPROFSYS_USE_HIP
OR ROCPROFSYS_USE_ROCTRACER
OR ROCPROFSYS_USE_ROCM_SMI)
list(APPEND _ROCPROFSYS_PAPI_COMPONENTS rocm)
endif()
# lmsensors
find_path(ROCPROFSYS_PAPI_LMSENSORS_ROOT_DIR NAMES include/sensors/sensors.h
include/sensors.h)
@@ -209,28 +202,35 @@ externalproject_add(
BUILD_IN_SOURCE 1
PATCH_COMMAND
${CMAKE_COMMAND} -E env CC=${PAPI_C_COMPILER}
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation LIBS=-lrt LDFLAGS=-lrt
${ROCPROFSYS_PAPI_EXTRA_ENV} <SOURCE_DIR>/configure --quiet
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free LIBS=-lrt
LDFLAGS=-lrt ${ROCPROFSYS_PAPI_EXTRA_ENV} <SOURCE_DIR>/configure --quiet
--prefix=${ROCPROFSYS_PAPI_INSTALL_DIR} --with-static-lib=yes --with-shared-lib=no
--with-perf-events --with-tests=no
--with-components=${_ROCPROFSYS_PAPI_COMPONENTS}
--libdir=${ROCPROFSYS_PAPI_INSTALL_DIR}/lib
CONFIGURE_COMMAND
${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
${CMAKE_COMMAND} -E env
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s -j
${ROCPROFSYS_PAPI_CONFIGURE_JOBS}
BUILD_COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
BUILD_COMMAND
${CMAKE_COMMAND} -E env
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
INSTALL_COMMAND ""
BUILD_BYPRODUCTS "${_ROCPROFSYS_PAPI_BUILD_BYPRODUCTS}")
# target for re-executing the installation
add_custom_target(
rocprofiler-systems-papi-install
COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s
COMMAND ${CMAKE_COMMAND} -E env CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
COMMAND
${CMAKE_COMMAND} -E env
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} static install -s
COMMAND
${CMAKE_COMMAND} -E env
CFLAGS=-fPIC\ -O3\ -Wno-stringop-truncation\ -Wno-use-after-free
${ROCPROFSYS_PAPI_EXTRA_ENV} ${MAKE_EXECUTABLE} utils install-utils -s
WORKING_DIRECTORY ${ROCPROFSYS_PAPI_SOURCE_DIR}/src
COMMENT "Installing PAPI...")
+26 -68
ファイルの表示
@@ -15,14 +15,12 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-threading
rocprofiler_systems_add_interface_library(
rocprofiler-systems-dyninst
"Provides flags and libraries for Dyninst (dynamic instrumentation)")
rocprofiler_systems_add_interface_library(rocprofiler-systems-hip
"Provides flags and libraries for HIP")
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm
"Provides flags and libraries for ROCm")
rocprofiler_systems_add_interface_library(rocprofiler-systems-roctracer
"Provides flags and libraries for roctracer")
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocprofiler
"Provides flags and libraries for rocprofiler")
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm-smi
"Provides flags and libraries for rocm-smi")
rocprofiler_systems_add_interface_library(
rocprofiler-systems-rccl
"Provides flags for ROCm Communication Collectives Library (RCCL)")
@@ -50,10 +48,7 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-compile-definition
# libraries with relevant compile definitions
set(ROCPROFSYS_EXTENSION_LIBRARIES
rocprofiler-systems::rocprofiler-systems-hip
rocprofiler-systems::rocprofiler-systems-roctracer
rocprofiler-systems::rocprofiler-systems-rocprofiler
rocprofiler-systems::rocprofiler-systems-rocm-smi
rocprofiler-systems::rocprofiler-systems-rocm
rocprofiler-systems::rocprofiler-systems-rccl
rocprofiler-systems::rocprofiler-systems-bfd
rocprofiler-systems::rocprofiler-systems-mpi
@@ -127,14 +122,11 @@ endforeach()
# ----------------------------------------------------------------------------------------#
#
# hip version
# ROCm Version
#
# ----------------------------------------------------------------------------------------#
if(ROCPROFSYS_USE_HIP
OR ROCPROFSYS_USE_ROCTRACER
OR ROCPROFSYS_USE_ROCPROFILER
OR ROCPROFSYS_USE_ROCM_SMI)
if(ROCPROFSYS_USE_ROCM)
find_package(ROCmVersion)
if(NOT ROCmVersion_FOUND)
@@ -164,13 +156,13 @@ if(ROCPROFSYS_USE_HIP
endif()
set(ROCPROFSYS_ROCM_VERSION ${ROCmVersion_FULL_VERSION})
set(ROCPROFSYS_HIP_VERSION_MAJOR ${ROCmVersion_MAJOR_VERSION})
set(ROCPROFSYS_HIP_VERSION_MINOR ${ROCmVersion_MINOR_VERSION})
set(ROCPROFSYS_HIP_VERSION_PATCH ${ROCmVersion_PATCH_VERSION})
set(ROCPROFSYS_HIP_VERSION ${ROCmVersion_TRIPLE_VERSION})
set(ROCPROFSYS_ROCM_VERSION_MAJOR ${ROCmVersion_MAJOR_VERSION})
set(ROCPROFSYS_ROCM_VERSION_MINOR ${ROCmVersion_MINOR_VERSION})
set(ROCPROFSYS_ROCM_VERSION_PATCH ${ROCmVersion_PATCH_VERSION})
set(ROCPROFSYS_ROCM_VERSION ${ROCmVersion_TRIPLE_VERSION})
if(ROCPROFSYS_HIP_VERSION_MAJOR GREATER_EQUAL 4 AND ROCPROFSYS_HIP_VERSION_MINOR
GREATER 3)
if(ROCPROFSYS_ROCM_VERSION_MAJOR GREATER_EQUAL 4 AND ROCPROFSYS_ROCM_VERSION_MINOR
GREATER 3)
set(roctracer_kfdwrapper_LIBRARY)
endif()
@@ -181,64 +173,30 @@ if(ROCPROFSYS_USE_HIP
rocprofiler_systems_add_feature(ROCPROFSYS_ROCM_VERSION
"ROCm version used by rocprofiler-systems")
else()
set(ROCPROFSYS_HIP_VERSION "0.0.0")
set(ROCPROFSYS_HIP_VERSION_MAJOR 0)
set(ROCPROFSYS_HIP_VERSION_MINOR 0)
set(ROCPROFSYS_HIP_VERSION_PATCH 0)
set(ROCPROFSYS_ROCM_VERSION "0.0.0")
set(ROCPROFSYS_ROCM_VERSION_MAJOR 0)
set(ROCPROFSYS_ROCM_VERSION_MINOR 0)
set(ROCPROFSYS_ROCM_VERSION_PATCH 0)
endif()
# ----------------------------------------------------------------------------------------#
#
# HIP
# ROCm
#
# ----------------------------------------------------------------------------------------#
if(ROCPROFSYS_USE_HIP)
find_package(hip ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-hip
INTERFACE ROCPROFSYS_USE_HIP)
target_link_libraries(rocprofiler-systems-hip INTERFACE hip::host)
endif()
if(ROCPROFSYS_USE_ROCM)
find_package(rocprofiler-sdk ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocm
INTERFACE ROCPROFSYS_USE_ROCM)
target_link_libraries(rocprofiler-systems-rocm
INTERFACE rocprofiler-sdk::rocprofiler-sdk)
# ----------------------------------------------------------------------------------------#
#
# roctracer
#
# ----------------------------------------------------------------------------------------#
if(ROCPROFSYS_USE_ROCTRACER)
find_package(roctracer ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-roctracer
INTERFACE ROCPROFSYS_USE_ROCTRACER)
target_link_libraries(
rocprofiler-systems-roctracer
INTERFACE roctracer::roctracer rocprofiler-systems::rocprofiler-systems-hip)
endif()
# ----------------------------------------------------------------------------------------#
#
# rocprofiler
#
# ----------------------------------------------------------------------------------------#
if(ROCPROFSYS_USE_ROCPROFILER)
find_package(rocprofiler ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocprofiler
INTERFACE ROCPROFSYS_USE_ROCPROFILER)
target_link_libraries(rocprofiler-systems-rocprofiler
INTERFACE rocprofiler::rocprofiler)
endif()
# ----------------------------------------------------------------------------------------#
#
# rocm-smi
#
# ----------------------------------------------------------------------------------------#
if(ROCPROFSYS_USE_ROCM_SMI)
find_package(rocm-smi ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rocm-smi
INTERFACE ROCPROFSYS_USE_ROCM_SMI)
target_link_libraries(rocprofiler-systems-rocm-smi INTERFACE rocm-smi::rocm-smi)
target_link_libraries(rocprofiler-systems-rocm INTERFACE rocm-smi::rocm-smi)
# find_package(amd-smi ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
# target_link_libraries(rocprofiler-systems-rocm INTERFACE amd-smi::amd-smi)
endif()
# ----------------------------------------------------------------------------------------#
-4
ファイルの表示
@@ -14,7 +14,3 @@ prepend-path PATH "${ROOT}/bin"
prepend-path LD_LIBRARY_PATH "${ROOT}/@CMAKE_INSTALL_LIBDIR@"
prepend-path PYTHONPATH "${ROOT}/@CMAKE_INSTALL_PYTHONDIR@"
setenv @PROJECT_NAME_UNDERSCORED@_DIR "${ROOT}/@CMAKE_INSTALL_DATAROOTDIR@/cmake/@PROJECT_NAME@"
# @ROCPROFSYS_HSA_ENV@setenv HSA_TOOLS_LIB "${ROOT}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@"
# @ROCPROFSYS_HSA_ENV@setenv HSA_TOOLS_REPORT_LOAD_FAILURE 1
# @ROCPROFSYS_ROCP_ENV@setenv ROCP_TOOL_LIB "${ROOT}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@"
-9
ファイルの表示
@@ -26,12 +26,3 @@ export LD_LIBRARY_PATH
export PYTHONPATH
export CMAKE_PREFIX_PATH
export @PROJECT_NAME_UNDERSCORED@_DIR
# ROCm environment variables
# @ROCPROFSYS_HSA_ENV@HSA_TOOLS_LIB="${BASEDIR}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys-dl@CMAKE_SHARED_LIBRARY_SUFFIX@"
# @ROCPROFSYS_HSA_ENV@HSA_TOOLS_REPORT_LOAD_FAILURE=1
# @ROCPROFSYS_ROCP_ENV@ROCP_TOOL_LIB="${BASEDIR}/@CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@rocprof-sys@CMAKE_SHARED_LIBRARY_SUFFIX@"
# @ROCPROFSYS_HSA_ENV@export HSA_TOOLS_LIB
# @ROCPROFSYS_HSA_ENV@export HSA_TOOLS_REPORT_LOAD_FAILURE
# @ROCPROFSYS_ROCP_ENV@export ROCP_TOOL_LIB
+1 -1
ファイルの表示
@@ -25,7 +25,7 @@ RUN zypper --non-interactive update -y && \
zypper --non-interactive install -y -t pattern devel_basis && \
zypper --non-interactive install -y binutils-gold cmake curl dpkg-devel \
gcc-c++ git libnuma-devel openmpi3-devel python3-pip rpm-build wget && \
python3 -m pip install 'cmake==3.18.4'
python3 -m pip install 'cmake==3.21'
ARG ROCM_VERSION=0.0
ARG AMDGPU_RPM=6.2/sle/15.6/amdgpu-install-6.2.60200-1.noarch.rpm
+1 -1
ファイルの表示
@@ -31,7 +31,7 @@ RUN zypper --non-interactive update -y && \
gcc-c++ git libnuma-devel openmpi3-devel papi-devel python3-pip \
rpm-build wget && \
zypper --non-interactive clean --all && \
python3 -m pip install 'cmake==3.18.4'
python3 -m pip install 'cmake==3.21'
COPY ./dyninst-source /tmp/dyninst
+1 -1
ファイルの表示
@@ -18,7 +18,7 @@ RUN yum groupinstall -y "Development Tools" && \
yum install -y --allowerasing cmake curl dpkg-devel numactl-devel openmpi-devel \
papi-devel python3-pip texinfo wget which zlib-devel && \
yum clean all && \
python3 -m pip install 'cmake==3.18.4'
python3 -m pip install 'cmake==3.21'
ARG ROCM_VERSION=0.0
ARG AMDGPU_RPM=6.2/rhel/9.4/amdgpu-install-6.2.60202-1.el9.noarch.rpm
+1 -1
ファイルの表示
@@ -22,7 +22,7 @@ RUN yum groupinstall -y "Development Tools" && \
yum install -y --allowerasing cmake curl dpkg-devel numactl-devel \
openmpi-devel papi-devel python3-pip texinfo wget which zlib-devel && \
yum clean all && \
python3 -m pip install 'cmake==3.18.4'
python3 -m pip install 'cmake==3.21'
COPY ./dyninst-source /tmp/dyninst
+2 -2
ファイルの表示
@@ -30,9 +30,9 @@ RUN apt-get update && \
python3-pip rpm texinfo wget && \
OS_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed 's/=/ /'1 | awk '{print $NF}' | sed 's/"//g') && \
if [ "${OS_VERSION}" == "24.04" ]; then \
python3 -m pip install --break-system-packages 'cmake==3.18.4'; \
python3 -m pip install --break-system-packages 'cmake==3.21'; \
else \
python3 -m pip install 'cmake==3.18.4'; \
python3 -m pip install 'cmake==3.21'; \
fi
RUN if [ "${ROCM_VERSION}" != "0.0" ]; then \
+2 -2
ファイルの表示
@@ -31,9 +31,9 @@ RUN apt-get update && \
python3-pip texinfo unzip wget zip zlib1g-dev && \
apt-get autoclean && \
if [ "${OS_VERSION}" == "24.04" ]; then \
python3 -m pip install --break-system-packages 'cmake==3.18.4' \
python3 -m pip install --break-system-packages 'cmake==3.21'; \
else \
python3 -m pip install 'cmake==3.18.4'; \
python3 -m pip install 'cmake==3.21'; \
fi
COPY ./dyninst-source /tmp/dyninst
+4 -10
ファイルの表示
@@ -228,7 +228,7 @@ Generating a default configuration file
ROCPROFSYS_PROFILE = false
ROCPROFSYS_USE_SAMPLING = false
ROCPROFSYS_USE_PROCESS_SAMPLING = true
ROCPROFSYS_USE_ROCTRACER = true
ROCPROFSYS_USE_ROCM = true
ROCPROFSYS_USE_ROCM_SMI = true
ROCPROFSYS_USE_KOKKOSP = false
ROCPROFSYS_USE_CODE_COVERAGE = false
@@ -248,9 +248,6 @@ Generating a default configuration file
ROCPROFSYS_PERFETTO_FILE = perfetto-trace.proto
ROCPROFSYS_PERFETTO_FILL_POLICY = discard
ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB = 4096
ROCPROFSYS_ROCTRACER_HSA_ACTIVITY = false
ROCPROFSYS_ROCTRACER_HSA_API = false
ROCPROFSYS_ROCTRACER_HSA_API_TYPES =
ROCPROFSYS_SAMPLING_CPUS =
ROCPROFSYS_SAMPLING_DELAY = 0.5
ROCPROFSYS_SAMPLING_FREQ = 10
@@ -363,13 +360,10 @@ Viewing the setting descriptions
| ROCPROFSYS_PERFETTO_FILL_POLICY | Behavior when perfetto buffer is ful... |
| ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... |
| ROCPROFSYS_PRECISION | Set the global output precision for ... |
| ROCPROFSYS_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support |
| ROCPROFSYS_ROCTRACER_HSA_API | Enable HSA API tracing support |
| ROCPROFSYS_ROCTRACER_HSA_API_TYPES | HSA API type to collect |
| ROCPROFSYS_SAMPLING_CPUS | CPUs to collect frequency informatio... |
| ROCPROFSYS_SAMPLING_DELAY | Number of seconds to wait before the... |
| ROCPROFSYS_SAMPLING_FREQ | Number of software interrupts per se... |
| ROCPROFSYS_SAMPLING_GPUS | Devices to query when ROCPROFSYS_USE_... |
| ROCPROFSYS_SAMPLING_GPUS | Devices to query when ROCPROFSYS_USE... |
| ROCPROFSYS_SCIENTIFIC | Set the global numerical reporting t... |
| ROCPROFSYS_STRICT_CONFIG | Throw errors for unknown setting nam... |
| ROCPROFSYS_SUPPRESS_CONFIG | Disable processing of setting config... |
@@ -391,13 +385,13 @@ Viewing the setting descriptions
| ROCPROFSYS_TRACE | Enable perfetto backend |
| ROCPROFSYS_USE_PID | Enable tagging filenames with proces... |
| ROCPROFSYS_USE_ROCM_SMI | Enable sampling GPU power, temp, uti... |
| ROCPROFSYS_USE_ROCTRACER | Enable ROCM tracing |
| ROCPROFSYS_USE_ROCM | Enable ROCM tracing |
| ROCPROFSYS_USE_SAMPLING | Enable statistical sampling of call-... |
| ROCPROFSYS_USE_PROCESS_SAMPLING | Enable a background thread which sam... |
| ROCPROFSYS_PROFILE | Enable timemory backend |
| ROCPROFSYS_VERBOSE | Verbosity level |
| ROCPROFSYS_WIDTH | Set the global output width for comp... |
|-----------------------------------------|-----------------------------------------|
|------------------------------------------|-----------------------------------------|
Viewing components
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+3 -13
ファイルの表示
@@ -268,8 +268,6 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
$ rocprof-sys-sample -- ./parallel-overhead-locks 30 4 100
HSA_TOOLS_LIB=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
HSA_TOOLS_REPORT_LOAD_FAILURE=1
LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
ROCPROFSYS_USE_PROCESS_SAMPLING=false
ROCPROFSYS_USE_SAMPLING=true
@@ -283,8 +281,6 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
$ rocprof-sys-sample -PTDH -I all -- ./parallel-overhead-locks 30 4 100
HSA_TOOLS_LIB=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
HSA_TOOLS_REPORT_LOAD_FAILURE=1
KOKKOS_PROFILE_LIBRARY=/opt/rocprofiler-systems/lib/librocprof-sys.so.1.7.1
LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
ROCPROFSYS_CPU_FREQ_ENABLED=true
@@ -298,9 +294,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
ROCPROFSYS_USE_PROCESS_SAMPLING=true
ROCPROFSYS_USE_RCCLP=true
ROCPROFSYS_USE_ROCM_SMI=true
ROCPROFSYS_USE_ROCPROFILER=true
ROCPROFSYS_USE_ROCTRACER=true
ROCPROFSYS_USE_ROCTX=true
ROCPROFSYS_USE_ROCM=true
ROCPROFSYS_USE_SAMPLING=true
ROCPROFSYS_PROFILE=true
OMP_TOOL_LIBRARIES=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
@@ -330,9 +324,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
ROCPROFSYS_USE_PROCESS_SAMPLING=true
ROCPROFSYS_USE_RCCLP=false
ROCPROFSYS_USE_ROCM_SMI=false
ROCPROFSYS_USE_ROCPROFILER=false
ROCPROFSYS_USE_ROCTRACER=false
ROCPROFSYS_USE_ROCTX=false
ROCPROFSYS_USE_ROCM=false
ROCPROFSYS_USE_SAMPLING=true
ROCPROFSYS_PROFILE=true
...
@@ -363,9 +355,7 @@ Here is the full output from the previous
ROCPROFSYS_USE_PROCESS_SAMPLING=true
ROCPROFSYS_USE_RCCLP=false
ROCPROFSYS_USE_ROCM_SMI=false
ROCPROFSYS_USE_ROCPROFILER=false
ROCPROFSYS_USE_ROCTRACER=false
ROCPROFSYS_USE_ROCTX=false
ROCPROFSYS_USE_ROCM=false
ROCPROFSYS_USE_SAMPLING=true
[rocprof-sys][dl][1785877] rocprofsys_main
[rocprof-sys][1785877][rocprofsys_init_tooling] Instrumentation mode: Sampling
+15 -17
ファイルの表示
@@ -241,8 +241,8 @@ Installing ROCm Systems Profiler
-----------------------------------
ROCm Systems Profiler has CMake configuration options for MPI support (``ROCPROFSYS_USE_MPI`` or
``ROCPROFSYS_USE_MPI_HEADERS``), HIP kernel tracing (``ROCPROFSYS_USE_ROCTRACER``),
ROCm device sampling (``ROCPROFSYS_USE_ROCM_SMI``), OpenMP-Tools (``ROCPROFSYS_USE_OMPT``),
``ROCPROFSYS_USE_MPI_HEADERS``),
ROCm tracing and sampling (``ROCPROFSYS_USE_ROCM``), OpenMP-Tools (``ROCPROFSYS_USE_OMPT``),
hardware counters via PAPI (``ROCPROFSYS_USE_PAPI``), among other features.
Various additional features can be enabled via the
``TIMEMORY_USE_*`` `CMake options <https://timemory.readthedocs.io/en/develop/installation.html#cmake-options>`_.
@@ -256,22 +256,20 @@ in `the Perfetto UI <https://ui.perfetto.dev>`_.
.. code-block:: shell
git clone https://github.com/ROCm/rocprofiler-systems.git rocprof-sys-source
cmake \
-B rocprof-sys-build \
cmake \
-B rocprof-sys-build \
-D CMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems \
-D ROCPROFSYS_USE_HIP=ON \
-D ROCPROFSYS_USE_ROCM_SMI=ON \
-D ROCPROFSYS_USE_ROCTRACER=ON \
-D ROCPROFSYS_USE_PYTHON=ON \
-D ROCPROFSYS_USE_OMPT=ON \
-D ROCPROFSYS_USE_MPI_HEADERS=ON \
-D ROCPROFSYS_BUILD_PAPI=ON \
-D ROCPROFSYS_BUILD_LIBUNWIND=ON \
-D ROCPROFSYS_BUILD_DYNINST=ON \
-D DYNINST_BUILD_TBB=ON \
-D DYNINST_BUILD_BOOST=ON \
-D DYNINST_BUILD_ELFUTILS=ON \
-D DYNINST_BUILD_LIBIBERTY=ON \
-D ROCPROFSYS_USE_ROCM=ON \
-D ROCPROFSYS_USE_PYTHON=ON \
-D ROCPROFSYS_USE_OMPT=ON \
-D ROCPROFSYS_USE_MPI_HEADERS=ON \
-D ROCPROFSYS_BUILD_PAPI=ON \
-D ROCPROFSYS_BUILD_LIBUNWIND=ON \
-D ROCPROFSYS_BUILD_DYNINST=ON \
-D DYNINST_BUILD_TBB=ON \
-D DYNINST_BUILD_BOOST=ON \
-D DYNINST_BUILD_ELFUTILS=ON \
-D DYNINST_BUILD_LIBIBERTY=ON \
rocprof-sys-source
cmake --build rocprof-sys-build --target all --parallel 8
cmake --build rocprof-sys-build --target install
+4 -4
ファイルの表示
@@ -372,7 +372,7 @@ if [ "${IS_DOCKER}" -ne 0 ]; then git config --global --add safe.directory ${PWD
verbose-run echo "Build rocprofiler-systems installers with generators: ${GENERATORS}"
build-and-package ${WITH_CORE} ${DISTRO}-core -DROCPROFSYS_USE_HIP=OFF -DROCPROFSYS_USE_MPI=OFF
build-and-package ${WITH_MPI} ${DISTRO}-${MPI_IMPL} -DROCPROFSYS_USE_HIP=OFF -DROCPROFSYS_USE_MPI=ON
build-and-package ${WITH_ROCM} ${DISTRO}-rocm-${ROCM_VERSION} -DROCPROFSYS_USE_HIP=ON -DROCPROFSYS_USE_MPI=OFF
build-and-package ${WITH_ROCM_MPI} ${DISTRO}-rocm-${ROCM_VERSION}-${MPI_IMPL} -DROCPROFSYS_USE_HIP=ON -DROCPROFSYS_USE_MPI=ON
build-and-package ${WITH_CORE} ${DISTRO}-core -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_MPI=OFF
build-and-package ${WITH_MPI} ${DISTRO}-${MPI_IMPL} -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_MPI=ON
build-and-package ${WITH_ROCM} ${DISTRO}-rocm-${ROCM_VERSION} -DROCPROFSYS_USE_ROCM=ON -DROCPROFSYS_USE_MPI=OFF
build-and-package ${WITH_ROCM_MPI} ${DISTRO}-rocm-${ROCM_VERSION}-${MPI_IMPL} -DROCPROFSYS_USE_ROCM=ON -DROCPROFSYS_USE_MPI=ON
+3 -12
ファイルの表示
@@ -1,17 +1,8 @@
# executable RPATH
if(ROCPROFSYS_USE_ROCPROFILER
AND rocprofiler_LIBRARY_DIR
AND ROCmVersion_TRIPLE_VERSION VERSION_LESS 5.2.0
AND NOT CMAKE_INSTALL_RPATH_USE_LINK_PATH)
set(ROCPROFSYS_EXE_INSTALL_RPATH
"\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}:${rocprofiler_LIBRARY_DIR}"
)
else()
set(ROCPROFSYS_EXE_INSTALL_RPATH
"\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}"
)
endif()
set(ROCPROFSYS_EXE_INSTALL_RPATH
"\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}"
)
# executables
add_subdirectory(rocprof-sys-avail)
+11 -10
ファイルの表示
@@ -33,8 +33,7 @@
#include "api.hpp"
#include "core/config.hpp"
#include "core/gpu.hpp"
#include "core/hip_runtime.hpp"
#include "library/rocprofiler.hpp"
#include "library/rocm.hpp"
#include <timemory/components.hpp>
#include <timemory/components/definition.hpp>
@@ -119,7 +118,7 @@ write_hw_counter_info(std::ostream&, const array_t<bool, N>& = {},
namespace
{
// initialize HIP before main so that librocprof-sys is not HSA_TOOLS_LIB
int gpu_count = rocprofsys::gpu::hip_device_count();
int gpu_count = rocprofsys::gpu::device_count();
// statically allocated shared_ptrs to prevent use after free errors
auto timemory_manager = tim::manager::master_instance();
@@ -508,15 +507,15 @@ main(int argc, char** argv)
return EXIT_FAILURE;
}
#if ROCPROFSYS_USE_HIP > 0
#if ROCPROFSYS_USE_ROCM > 0
if(gpu_count > 0)
{
size_t _num_metrics = 0;
try
{
// call to rocm_metrics() will add choices to ROCPROFSYS_ROCM_EVENTS setting
// call to rocm_events() will add choices to ROCPROFSYS_ROCM_EVENTS setting
// so always perform this call even if list of HW counters is not requested
_num_metrics = rocprofsys::rocprofiler::rocm_metrics().size();
_num_metrics = rocprofsys::rocm::rocm_events().size();
} catch(std::runtime_error& _e)
{
verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what());
@@ -615,9 +614,9 @@ main(int argc, char** argv)
}
}
signal(SIGABRT, &dump_log_abort);
signal(SIGSEGV, &dump_log_abort);
signal(SIGQUIT, &dump_log_abort);
// signal(SIGABRT, &dump_log_abort);
// signal(SIGSEGV, &dump_log_abort);
// signal(SIGQUIT, &dump_log_abort);
if(!os) os = &std::cout;
@@ -641,6 +640,8 @@ main(int argc, char** argv)
}
dump_log();
const_cast<std::shared_ptr<tim::settings>&>(tim::settings::shared_instance()).reset();
return 0;
}
@@ -1076,7 +1077,7 @@ write_hw_counter_info(std::ostream& os, const array_t<bool, N>& options,
auto _papi_events = tim::papi::available_events_info();
auto _rocm_events =
(gpu_count > 0) ? rocprofsys::rocprofiler::rocm_metrics() : hwcounter_info_t{};
(gpu_count > 0) ? rocprofsys::rocm::rocm_events() : hwcounter_info_t{};
if(alphabetical)
{
+1 -1
ファイルの表示
@@ -339,7 +339,7 @@ generate_config(std::string _config_file, const std::set<std::string>& _config_f
for(const auto* itr :
{ "ROCPROFSYS_CONFIG", "ROCPROFSYS_MODE", "ROCPROFSYS_TRACE",
"ROCPROFSYS_PROFILE", "ROCPROFSYS_USE_SAMPLING",
"ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCTRACER",
"ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCM",
"ROCPROFSYS_USE_ROCM_SMI", "ROCPROFSYS_USE_KOKKOSP",
"ROCPROFSYS_USE_OMPT", "ROCPROFSYS_USE", "ROCPROFSYS_OUTPUT" })
{
-2
ファイルの表示
@@ -29,8 +29,6 @@
#include "library/components/fork_gotcha.hpp"
#include "library/components/mpi_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/rocprofiler.hpp"
#include "library/components/roctracer.hpp"
#include <timemory/components/definition.hpp>
#include <timemory/enum.h>
-9
ファイルの表示
@@ -752,10 +752,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env,
parser.end_group();
#if ROCPROFSYS_HIP_VERSION > 0 && ROCPROFSYS_HIP_VERSION < 50300
update_env(_env, "HSA_ENABLE_INTERRUPT", 0);
#endif
auto _inpv = std::vector<char*>{};
auto _outv = std::vector<char*>{};
bool _hash = false;
@@ -824,11 +820,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env,
add_default_env(_env, "ROCPROFSYS_USE_MPIP", true);
#endif
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
add_default_env(_env, "ROCPROFSYS_ROCTRACER_HIP_API", true);
add_default_env(_env, "ROCPROFSYS_ROCTRACER_HSA_API", true);
#endif
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
add_default_env(_env, "ROCPROFSYS_USE_RCCLP", true);
#endif
+2
ファイルの表示
@@ -35,6 +35,8 @@ target_link_libraries(
timemory::timemory-extensions
timemory::timemory-core)
add_target_flag_if_avail(rocprofiler-systems-instrument "-Wno-deprecated-declarations")
set_target_properties(
rocprofiler-systems-instrument
PROPERTIES BUILD_RPATH "\$ORIGIN:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
+19 -7
ファイルの表示
@@ -312,13 +312,25 @@ get_internal_basic_libs_impl()
"liblzma.so" };
// shared libraries used by rocprof-sys
const auto _omni_libs = strview_init_t{
"libstdc++.so.6", "libgotcha.so", "libunwind-coredump.so",
"libunwind-generic.so", "libunwind-ptrace.so", "libunwind-setjmp.so",
"libunwind.so", "libunwind-x86_64.so", "librocm_smi64.so",
"libroctx64.so", "librocmtools.so", "libroctracer64.so",
"librocprofiler64.so", "libpapi.so", "libpfm.so"
};
const auto _omni_libs = strview_init_t{ "libstdc++.so.6",
"libgotcha.so",
"libunwind-coredump.so",
"libunwind-generic.so",
"libunwind-ptrace.so",
"libunwind-setjmp.so",
"libunwind.so",
"libunwind-x86_64.so",
"librocm_smi64.so",
"libroctx64.so",
"librocmtools.so",
"libroctracer64.so",
"librocprofiler64.so",
"libpapi.so",
"libpfm.so",
"librocprofiler-register.so",
"librocprofiler-sdk.so",
"librocprofiler-sdk-roctx.so",
"libamd_smi.so" };
// shared libraries potentially used by timemory
const auto _3rdparty_libs = strview_init_t{ "libcaliper.so",
@@ -357,10 +357,12 @@ main(int argc, char** argv)
itr.find("rocprof-sys") != std::string::npos ||
itr.find("rocprofiler-systems") != std::string::npos ||
std::regex_search(
itr, std::regex{ "lib(dyninstAPI|stackwalk|pcontrol|patchAPI|parseAPI|"
"instructionAPI|symtabAPI|dynDwarf|common|dynElf|tbb|"
"tbbmalloc|tbbmalloc_proxy|gotcha|libunwind|roctracer|"
"hsa-runtime|amdhip|rocm_smi)\\.(so|a)" }))
itr, std::regex{
"lib(dyninstAPI|stackwalk|pcontrol|patchAPI|parseAPI|"
"instructionAPI|symtabAPI|dynDwarf|common|dynElf|tbb|tbbmalloc|"
"tbbmalloc_proxy|gotcha|libunwind|roctracer64|hsa-runtime|amdhip|"
"amd_comgr|rocm_smi64|rocprofiler64|rocprofiler-register|"
"rocprofiler-sdk|rocprofiler-sdk-roctx|amd_smi)\\.(so|a)" }))
{
if(!find(filepath::dirname(itr), lib_search_paths))
lib_search_paths.emplace_back(filepath::dirname(itr));
+24 -72
ファイルの表示
@@ -44,14 +44,6 @@
#include <unistd.h>
#include <vector>
#if !defined(ROCPROFSYS_USE_ROCTRACER)
# define ROCPROFSYS_USE_ROCTRACER 0
#endif
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
# define ROCPROFSYS_USE_ROCPROFILER 0
#endif
namespace color = tim::log::color;
using namespace timemory::join;
using tim::get_env;
@@ -140,17 +132,6 @@ get_initial_environment()
update_env(_env, "ROCPROFSYS_USE_SAMPLING", (_mode != "causal"));
#if defined(ROCPROFSYS_USE_ROCTRACER) || defined(ROCPROFSYS_USE_ROCPROFILER)
update_env(_env, "HSA_TOOLS_LIB", _dl_libpath);
if(!getenv("HSA_TOOLS_REPORT_LOAD_FAILURE"))
update_env(_env, "HSA_TOOLS_REPORT_LOAD_FAILURE", "1");
#endif
#if defined(ROCPROFSYS_USE_ROCPROFILER)
update_env(_env, "ROCP_TOOL_LIB", _omni_libpath);
if(!getenv("ROCP_HSA_INTERCEPT")) update_env(_env, "ROCP_HSA_INTERCEPT", "1");
#endif
#if defined(ROCPROFSYS_USE_OMPT)
if(!getenv("OMP_TOOL_LIBRARIES"))
update_env(_env, "OMP_TOOL_LIBRARIES", _dl_libpath, UPD_APPEND);
@@ -357,14 +338,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
%{INDENT}% 0 avoid triggering the bug, potentially at the cost of reduced performance
%{INDENT}% 1 do not modify how ROCm is notified about kernel completion)";
auto _realtime_reqs = (get_env("HSA_ENABLE_INTERRUPT", std::string{}, false).empty())
? std::vector<std::string>{ "hsa-interrupt" }
: std::vector<std::string>{};
#if ROCPROFSYS_USE_ROCTRACER == 0 && ROCPROFSYS_USE_ROCPROFILER == 0
_realtime_reqs.clear();
#endif
const auto* _trace_policy_desc =
R"(Policy for new data when the buffer size limit is reached:
%{INDENT}%- discard : new data is ignored
@@ -720,7 +693,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
parser.add_argument({ "--realtime" }, _realtime_desc)
.min_count(0)
.required(std::move(_realtime_reqs))
.action([&](parser_t& p) {
auto _v = p.get<std::deque<std::string>>("realtime");
update_env(_env, "ROCPROFSYS_SAMPLING_REALTIME", true);
@@ -741,10 +713,20 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
}
});
std::set<std::string> _backend_choices = { "all", "kokkosp", "mpip",
"ompt", "rcclp", "rocm-smi",
"roctracer", "rocprofiler", "roctx",
"mutex-locks", "spin-locks", "rw-locks" };
std::set<std::string> _backend_choices = { "all",
"kokkosp",
"mpip",
"ompt",
"rcclp",
"rocm-smi",
"roctracer",
"rocprofiler",
"roctx",
"mutex-locks",
"spin-locks",
"rw-locks",
"rocprofiler-sdk",
"rocm" };
#if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)
_backend_choices.erase("mpip");
@@ -758,17 +740,10 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
_backend_choices.erase("rcclp");
#endif
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
#if !defined(ROCPROFSYS_USE_ROCM)
_backend_choices.erase("rocm");
_backend_choices.erase("rocm-smi");
#endif
#if !defined(ROCPROFSYS_USE_ROCTRACER)
_backend_choices.erase("roctracer");
_backend_choices.erase("roctx");
#endif
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
_backend_choices.erase("rocprofiler");
_backend_choices.erase("rocprofiler-sdk");
#endif
parser.start_group("BACKEND OPTIONS",
@@ -784,11 +759,9 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
@@ -810,27 +783,18 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
if(_v.count("all") > 0 ||
(_v.count("roctracer") > 0 && _v.count("rocprofiler") > 0))
{
remove_env(_env, "HSA_TOOLS_LIB");
remove_env(_env, "HSA_TOOLS_REPORT_LOAD_FAILURE");
}
if(_v.count("all") > 0 || _v.count("rocprofiler") > 0)
{
remove_env(_env, "ROCP_TOOL_LIB");
remove_env(_env, "ROCP_HSA_INTERCEPT");
}
// if(_v.count("all") > 0 || _v.count("rocprofiler") > 0)
// {
// remove_env(_env, "ROCP_TOOL_LIB");
// remove_env(_env, "ROCP_HSA_INTERCEPT");
// }
if(_v.count("all") > 0 || _v.count("ompt") > 0)
remove_env(_env, "OMP_TOOL_LIBRARIES");
@@ -850,18 +814,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
update_env(_env, "ROCPROFSYS_PAPI_EVENTS", _events);
});
#if defined(ROCPROFSYS_USE_ROCPROFILER)
parser
.add_argument({ "-G", "--gpu-events" },
"Set the GPU hardware counter events to record (ref: "
"`rocprof-sys-avail -H -c GPU`)")
.action([&](parser_t& p) {
auto _events =
join(array_config{ "," }, p.get<std::vector<std::string>>("gpu-events"));
update_env(_env, "ROCPROFSYS_ROCM_EVENTS", _events);
});
#endif
parser.start_group("MISCELLANEOUS OPTIONS", "");
parser
.add_argument({ "-i", "--inlines" },
+2 -13
ファイルの表示
@@ -12,15 +12,7 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20)
cmake_policy(SET CMP0115 NEW)
endif()
if(ROCPROFSYS_USE_ROCPROFILER
AND rocprofiler_LIBRARY_DIR
AND ROCmVersion_TRIPLE_VERSION VERSION_LESS 5.2.0
AND NOT CMAKE_INSTALL_RPATH_USE_LINK_PATH)
set(ROCPROFSYS_LIB_INSTALL_RPATH
"\$ORIGIN:\$ORIGIN/${PROJECT_NAME}:${rocprofiler_LIBRARY_DIR}")
else()
set(ROCPROFSYS_LIB_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/${PROJECT_NAME}")
endif()
set(ROCPROFSYS_LIB_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/${PROJECT_NAME}")
# ------------------------------------------------------------------------------#
#
@@ -50,10 +42,7 @@ target_link_libraries(
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-bfd>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-mpi>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-ptl>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-hip>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-roctracer>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocprofiler>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm-smi>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rccl>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libgcc-optional>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libstdcxx-optional>
+3 -1
ファイルの表示
@@ -19,7 +19,9 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/environment.hpp
${CMAKE_CURRENT_SOURCE_DIR}/invoke.hpp
${CMAKE_CURRENT_SOURCE_DIR}/join.hpp
${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp)
${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp
${CMAKE_CURRENT_SOURCE_DIR}/static_object.hpp
${CMAKE_CURRENT_SOURCE_DIR}/synchronized.hpp)
get_filename_component(COMMON_SOURCE_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" DIRECTORY)
get_filename_component(COMMON_BINARY_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}" DIRECTORY)
+12 -12
ファイルの表示
@@ -42,10 +42,10 @@
#define ROCPROFSYS_COMPILER_STRING ROCPROFSYS_COMPILER_ID " v" ROCPROFSYS_COMPILER_VERSION
#define ROCPROFSYS_DEFAULT_ROCM_PATH "@ROCmVersion_DIR@"
#define ROCPROFSYS_HIP_VERSION_STRING "@ROCPROFSYS_HIP_VERSION@"
#define ROCPROFSYS_HIP_VERSION_MAJOR @ROCPROFSYS_HIP_VERSION_MAJOR@
#define ROCPROFSYS_HIP_VERSION_MINOR @ROCPROFSYS_HIP_VERSION_MINOR@
#define ROCPROFSYS_HIP_VERSION_PATCH @ROCPROFSYS_HIP_VERSION_PATCH@
#define ROCPROFSYS_ROCM_VERSION_STRING "@ROCPROFSYS_ROCM_VERSION@"
#define ROCPROFSYS_ROCM_VERSION_MAJOR @ROCPROFSYS_ROCM_VERSION_MAJOR@
#define ROCPROFSYS_ROCM_VERSION_MINOR @ROCPROFSYS_ROCM_VERSION_MINOR@
#define ROCPROFSYS_ROCM_VERSION_PATCH @ROCPROFSYS_ROCM_VERSION_PATCH@
// these can be set via defining the variable in CMake, e.g.:
// cmake -D ROCPROFSYS_CACHELINE_SIZE=N /path/to/source
@@ -63,15 +63,15 @@
((10000 * ROCPROFSYS_VERSION_MAJOR) + (100 * ROCPROFSYS_VERSION_MINOR) + \
ROCPROFSYS_VERSION_PATCH)
#define ROCPROFSYS_HIP_VERSION \
((10000 * ROCPROFSYS_HIP_VERSION_MAJOR) + (100 * ROCPROFSYS_HIP_VERSION_MINOR) + \
ROCPROFSYS_HIP_VERSION_PATCH)
#define ROCPROFSYS_ROCM_VERSION \
((10000 * ROCPROFSYS_ROCM_VERSION_MAJOR) + (100 * ROCPROFSYS_ROCM_VERSION_MINOR) + \
ROCPROFSYS_ROCM_VERSION_PATCH)
#if ROCPROFSYS_HIP_VERSION_MAJOR > 0
# define ROCPROFSYS_HIP_VERSION_COMPAT_STRING \
"v@ROCPROFSYS_HIP_VERSION_MAJOR@.@ROCPROFSYS_HIP_VERSION_MINOR@.x"
#if ROCPROFSYS_ROCM_VERSION_MAJOR > 0
# define ROCPROFSYS_ROCM_VERSION_COMPAT_STRING \
"v@ROCPROFSYS_ROCM_VERSION_MAJOR@.@ROCPROFSYS_ROCM_VERSION_MINOR@.x"
#else
# define ROCPROFSYS_HIP_VERSION_COMPAT_STRING ""
# define ROCPROFSYS_ROCM_VERSION_COMPAT_STRING ""
#endif
// this should be passed to argparse::argument_parser::enable_version
@@ -83,7 +83,7 @@
{ \
{ "", ROCPROFSYS_LIBRARY_ARCH }, { "compiler", ROCPROFSYS_COMPILER_STRING }, \
{ \
"rocm", ROCPROFSYS_HIP_VERSION_COMPAT_STRING \
"rocm", ROCPROFSYS_ROCM_VERSION_COMPAT_STRING \
} \
}
#endif
-142
ファイルの表示
@@ -109,148 +109,6 @@ get_environ(int _verbose, std::string _search_paths = {},
_omnilib = common::path::find_path(_omnilib, _verbose, _search_paths);
_omnilib_dl = common::path::find_path(_omnilib_dl, _verbose, _search_paths);
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
_data.emplace_back(env_config{ "HSA_TOOLS_LIB", _omnilib.c_str(), 0 });
#endif
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
# if ROCPROFSYS_HIP_VERSION >= 50200
# define ROCPROFILER_METRICS_DIR "lib/rocprofiler"
# else
# define ROCPROFILER_METRICS_DIR "rocprofiler/lib"
# endif
# if ROCPROFSYS_HIP_VERSION <= 50500
# define ROCPROFILER_LIBNAME "librocprofiler64.so"
# else
# define ROCPROFILER_LIBNAME "librocprofiler64.so.1"
# endif
_data.emplace_back(env_config{ "HSA_TOOLS_LIB", _omnilib.c_str(), 0 });
_data.emplace_back(env_config{ "ROCP_TOOL_LIB", _omnilib.c_str(), 0 });
_data.emplace_back(env_config{ "ROCPROFILER_LOG", "1", 0 });
_data.emplace_back(env_config{ "ROCP_HSA_INTERCEPT", "1", 0 });
_data.emplace_back(env_config{ "HSA_TOOLS_REPORT_LOAD_FAILURE", "1", 0 });
auto _possible_rocp_metrics = std::vector<std::string>{};
auto _possible_rocprof_libs = std::vector<std::string>{};
for(const auto* itr : { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" })
{
if(getenv(itr))
{
_possible_rocp_metrics.emplace_back(
common::join('/', getenv(itr), "lib/rocprofiler"));
_possible_rocprof_libs.emplace_back(
common::join('/', getenv(itr), "lib/rocprofiler", ROCPROFILER_LIBNAME));
_possible_rocp_metrics.emplace_back(
common::join('/', getenv(itr), "rocprofiler/lib"));
_possible_rocprof_libs.emplace_back(
common::join('/', getenv(itr), "rocprofiler/lib", ROCPROFILER_LIBNAME));
}
}
// default path
_possible_rocp_metrics.emplace_back(
common::join('/', ROCPROFSYS_DEFAULT_ROCM_PATH, "lib/rocprofiler"));
_possible_rocp_metrics.emplace_back(
common::join('/', ROCPROFSYS_DEFAULT_ROCM_PATH, "rocprofiler/lib"));
auto _realpath_and_unique = [](const auto& _inp_v) {
auto _out_v = decltype(_inp_v){};
for(auto& itr : _inp_v)
{
if(path::exists(itr)) _out_v.emplace_back(path::realpath(itr));
}
_out_v.erase(std::unique(_out_v.begin(), _out_v.end()), _out_v.end());
return _out_v;
};
_possible_rocprof_libs = _realpath_and_unique(_possible_rocprof_libs);
for(const auto& itr : _possible_rocprof_libs)
{
if(path::exists(itr))
{
_data.emplace_back(
env_config{ "ROCPROFSYS_ROCPROFILER_LIBRARY", itr.c_str(), 0 });
_possible_rocp_metrics.emplace(
_possible_rocp_metrics.begin(),
common::join('/', path::dirname(itr), "../../lib/rocprofiler"));
_possible_rocp_metrics.emplace(_possible_rocp_metrics.begin(),
common::join('/', path::dirname(itr)));
}
}
_possible_rocp_metrics = _realpath_and_unique(_possible_rocp_metrics);
auto _env_rocp_metrics = get_env("ROCP_METRICS", "");
if(!_env_rocp_metrics.empty())
{
if(!path::exists(_env_rocp_metrics))
throw std::runtime_error(join("", "Error! ROCP_METRICS file \"",
_env_rocp_metrics, "\" does not exist"));
_possible_rocp_metrics.clear();
_possible_rocp_metrics.emplace_back(
common::join('/', path::dirname(_env_rocp_metrics)));
}
auto _found_rocp_metrics = (!_env_rocp_metrics.empty())
? get_env("ROCPROFSYS_ROCP_METRICS_FORCE_VALID", false)
: false;
if(!_found_rocp_metrics)
{
for(const auto& itr : _possible_rocp_metrics)
{
auto _metrics_path = join('/', itr, "metrics.xml");
if(path::exists(itr) && path::exists(_metrics_path) &&
path::exists(join('/', itr, "gfx_metrics.xml")))
{
_found_rocp_metrics = true;
_data.emplace_back(
env_config{ "ROCP_METRICS", _metrics_path.c_str(), 0 });
break;
}
}
}
// handle error
if(!_found_rocp_metrics)
{
auto _msg = std::stringstream{};
_msg << std::boolalpha;
if(!_env_rocp_metrics.empty())
{
auto _env_rocp_metrics_dir = path::dirname(_env_rocp_metrics);
auto _rocp_metrics_xml = join('/', _env_rocp_metrics_dir, "metrics.xml");
auto _rocp_gfx_metrics_xml =
join('/', _env_rocp_metrics_dir, "gfx_metrics.xml");
_msg << "Error! ROCP_METRICS=\"" << _env_rocp_metrics
<< "\" in the environment but the directory (" << _env_rocp_metrics_dir
<< ") does not contain "
"metrics.xml (found: "
<< path::exists(_rocp_metrics_xml) << ") and/or gfx_metrics.xml (found: "
<< path::exists(_rocp_gfx_metrics_xml)
<< "). To ignore this error, set "
"ROCPROFSYS_ROCP_METRICS_FORCE_VALID=true in the environment";
}
else
{
_msg
<< "Error! ROCP_METRICS not set in environment and rocprof-sys could not "
"find a suitable path. Please set ROCP_METRICS=/path/to/metrics.xml "
"in the environment. This file is typically located in the same "
"folder as the librocprofiler64.so library.\nAdditional note: "
"metrics.xml typically contains:\n\t#include "
"\"gfx_metrics.xml\"\nMake sure the provided path also contains this "
"file.\nExample:\n\texport ROCP_METRICS="
<< ROCPROFSYS_DEFAULT_ROCM_PATH << "/" << ROCPROFILER_METRICS_DIR
<< "/metrics.xml\n";
}
throw std::runtime_error(_msg.str());
}
#endif
#if defined(ROCPROFSYS_USE_OMPT) && ROCPROFSYS_USE_OMPT > 0
if(get_env("ROCPROFSYS_USE_OMPT", true))
{
+207
ファイルの表示
@@ -0,0 +1,207 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <array>
#include <cstddef>
#include <deque>
#include <functional>
#include <iostream>
#include <mutex>
#include <stack>
namespace rocprofsys
{
inline namespace common
{
// Signature of a teardown callback: a plain function pointer taking no arguments.
// Callbacks of this type are pushed by register_static_dtor() and invoked by
// destroy_static_objects().
using static_dtor_func_t = void (*)();
// Invokes every registered teardown callback, most recently registered first,
// and then releases the callback stack.
void
destroy_static_objects();
// Adds a teardown callback to the stack consumed by destroy_static_objects().
void
register_static_dtor(static_dtor_func_t&&);
namespace
{
// Default `ContextT` tag for static_object. Because it lives in an unnamed namespace
// it is a distinct type in every translation unit that includes this header.
struct anonymous
{};
} // namespace
// Tag type selecting the static_object::construct overload that does not register
// a teardown callback for the constructed object.
struct do_not_destroy
{};
/// Number of bytes of raw storage required to hold one object of type `Tp`.
template <typename Tp>
constexpr size_t
static_buffer_size()
{
    constexpr size_t _nbytes = sizeof(Tp);
    return _nbytes;
}
/**
* @brief This struct is used to create static singleton objects which have the properties
* of a heap-allocated static object without a memory leak.
*
* @tparam Tp Data type of singleton
* @tparam ContextT Use to differentiate singletons in different translation units (if
* using default parameter) or ensure the singleton can be accessed in different
* translation units (not recommended) as long as this type is not in an anonymous
* namespace
*
* This template works by creating a buffer of at least `sizeof(Tp)` bytes in the binary
* and does a placement new into that buffer. The object created is NOT heap allocated,
* the address of the object is an address in between the library load address and the
* load address + size of library.
*/
template <typename Tp, typename ContextT = anonymous>
struct static_object
{
    // Purely static interface: static_object itself is never instantiated,
    // copied, moved, or destroyed.
    static_object()                         = delete;
    ~static_object()                        = delete;
    static_object(const static_object&)     = delete;
    static_object(static_object&&) noexcept = delete;
    static_object& operator=(const static_object&) = delete;
    static_object& operator=(static_object&&) noexcept = delete;

    /// Placement-constructs the singleton from `args` inside the static buffer and
    /// returns a reference to the stored pointer. Unless `Tp` is trivial and
    /// standard-layout, a callback that runs `~Tp()` is registered once via
    /// register_static_dtor(). Aborts if the object has already been constructed.
    template <typename... Args>
    static Tp*& construct(Args&&... args);

    /// Same as above but never registers a teardown callback, i.e. the destructor
    /// of the object is not run by destroy_static_objects().
    template <typename... Args>
    static Tp*& construct(do_not_destroy&&, Args&&... args);

    /// Pointer to the constructed object, or nullptr if it is not currently constructed.
    static Tp* get() { return m_object; }

    /// True when `Tp` is both standard-layout and trivial.
    static constexpr bool is_trivial_standard_layout();

private:
    static Tp* m_object;
    // Raw storage for the placement-new in construct(). A std::byte array only has
    // alignment 1, so the storage must be explicitly aligned for Tp; constructing a
    // Tp at an under-aligned address is undefined behavior.
    alignas(Tp) static std::array<std::byte, static_buffer_size<Tp>()> m_buffer;
};

template <typename Tp, typename ContextT>
Tp* static_object<Tp, ContextT>::m_object = nullptr;

// The alignment specifier must be repeated on the defining declaration.
template <typename Tp, typename ContextT>
alignas(Tp) std::array<std::byte, static_buffer_size<Tp>()>
    static_object<Tp, ContextT>::m_buffer = {};
/// Reports whether `Tp` is both a standard-layout type and a trivial type.
template <typename Tp, typename ContextT>
constexpr bool
static_object<Tp, ContextT>::is_trivial_standard_layout()
{
    constexpr bool _standard_layout = std::is_standard_layout_v<Tp>;
    constexpr bool _trivial         = std::is_trivial_v<Tp>;
    return (_standard_layout && _trivial);
}
/// Constructs the singleton in the static buffer, forwarding `args` to `Tp`.
/// Returns a reference to the stored object pointer; aborts on a second construction.
template <typename Tp, typename ContextT>
template <typename... Args>
Tp*&
static_object<Tp, ContextT>::construct(Args&&... args)
{
    // Types that are trivial and standard-layout skip destructor registration;
    // everything else registers its teardown callback exactly once.
    if constexpr(!is_trivial_standard_layout())
    {
        static auto _registered = std::once_flag{};
        std::call_once(_registered, []() {
            register_static_dtor([]() {
                auto*& _obj = static_object<Tp, ContextT>::m_object;
                if(!_obj) return;
                // run the destructor in place; the storage itself is static
                _obj->~Tp();
                _obj = nullptr;
            });
        });
    }

    if(m_object != nullptr)
    {
        std::cerr
            << "reconstructing static object. Use get() function to retrieve pointer"
            << std::endl;
        abort();
    }

    auto* _storage = m_buffer.data();
    m_object       = new(_storage) Tp{ std::forward<Args>(args)... };
    return m_object;
}
/// Constructs the singleton in the static buffer without registering any teardown
/// callback. Returns a reference to the stored object pointer; aborts on a second
/// construction.
template <typename Tp, typename ContextT>
template <typename... Args>
Tp*&
static_object<Tp, ContextT>::construct(do_not_destroy&&, Args&&... args)
{
    if(m_object != nullptr)
    {
        std::cerr
            << "reconstructing static object. Use get() function to retrieve pointer"
            << std::endl;
        abort();
    }

    auto* _storage = m_buffer.data();
    m_object       = new(_storage) Tp{ std::forward<Args>(args)... };
    return m_object;
}
// NOTE: these helpers are deliberately `inline` with external linkage instead of living
// in an unnamed namespace. The inline functions below have external linkage, so
// every translation unit must see them refer to the *same* stack and mutex;
// internal-linkage helpers would give each translation unit its own copy.

/// Returns a reference to the pointer holding the LIFO stack of teardown callbacks.
/// The pointer is set to nullptr once destroy_static_objects() has released the stack.
inline auto*&
get_static_object_stack()
{
    static auto* _v = new std::stack<static_dtor_func_t>{};
    return _v;
}

/// Returns the single mutex guarding the callback stack for both registration and
/// teardown. It is recursive so that a teardown callback may itself register a
/// callback (or re-enter teardown) on the same thread without deadlocking, and it is
/// intentionally never freed so it stays usable during late process teardown.
inline std::recursive_mutex&
get_static_object_mutex()
{
    static auto* _v = new std::recursive_mutex{};
    return *_v;
}

/// Invokes every registered teardown callback, most recently registered first, and
/// then deletes the callback stack. Subsequent calls are no-ops.
inline void
destroy_static_objects()
{
    auto   _lk    = std::unique_lock<std::recursive_mutex>{ get_static_object_mutex() };
    auto*& _stack = get_static_object_stack();

    // _stack is re-checked on every iteration because a callback may re-enter this
    // function, which deletes the stack and resets the pointer.
    while(_stack && !_stack->empty())
    {
        // Copy and pop *before* invoking: if the callback registers a new entry, a
        // pop performed afterwards would discard that new entry instead of this one.
        auto _func = _stack->top();
        _stack->pop();
        if(_func) _func();
    }

    delete _stack;
    _stack = nullptr;
}

/// Pushes a teardown callback onto the stack. The callback is silently dropped if the
/// stack has already been released by destroy_static_objects().
inline void
register_static_dtor(static_dtor_func_t&& _func)
{
    auto   _lk    = std::unique_lock<std::recursive_mutex>{ get_static_object_mutex() };
    auto*& _stack = get_static_object_stack();
    if(_stack) _stack->push(_func);
}
} // namespace common
} // namespace rocprofsys
+167
ファイルの表示
@@ -0,0 +1,167 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <cstddef>
#include <functional>
#include <mutex>
#include <shared_mutex>
#include <type_traits>
namespace rocprofsys
{
inline namespace common
{
/**
 * Synchronized is a wrapper that adds lock based write/read
 * protection around a datatype. The protected data is accessed
 * only by rlock/wlock. rlock(lambda) gets a reader lock of the
 * protected value, passing the protected value to the lambda as a
 * const. wlock(lambda) gets a writer lock on the protected value
 * and does the same. The reason for this class is to make it less
 * error prone to access shared data and more obvious when a lock
 * is being held.
 *
 * The wrapper is move-only. Moving transfers the protected value while
 * the source (and, for assignment, the destination) is exclusively locked;
 * the mutexes themselves are never moved.
 *
 * Example usage:
 *
 * synchronized<int> x(9);
 * x.rlock([](const auto& data){
 * // data = 9
 * });
 *
 * x.wlock([](auto& data){
 * // set data to new value
 * });
 */
template <typename LockedType, bool IsMappedTypeV = false>
class synchronized
{
public:
    using value_type = LockedType;
    using this_type  = synchronized<value_type, IsMappedTypeV>;

    synchronized()  = default;
    ~synchronized() = default;

    explicit synchronized(value_type&& data)
    : m_data{ std::move(data) }
    {}

    // std::shared_mutex is neither copyable nor movable, so defaulting the move
    // operations would silently define them as deleted. They are implemented
    // explicitly: only the protected value is moved, under an exclusive lock.
    synchronized(synchronized&& data) noexcept
    : synchronized(std::move(data), std::unique_lock<std::shared_mutex>{ data.m_mutex })
    {}

    synchronized& operator=(synchronized&& data) noexcept
    {
        if(this != &data)
        {
            // scoped_lock acquires both mutexes with deadlock avoidance
            std::scoped_lock<std::shared_mutex, std::shared_mutex> _lk{ m_mutex,
                                                                        data.m_mutex };
            m_data = std::move(data.m_data);
        }
        return *this;
    }

    // Do not allow this data structure to be copied, std::move only.
    synchronized(const synchronized&) = delete;
    synchronized& operator=(const synchronized&) = delete;

    // Invokes `lambda(const value_type&, args...)` while holding a shared (reader) lock.
    template <typename FuncT, typename... Args>
    decltype(auto) rlock(FuncT&& lambda, Args&&... args) const;

    // Invokes `lambda(value_type&, args...)` while holding an exclusive (writer) lock.
    template <typename FuncT, typename... Args>
    decltype(auto) wlock(FuncT&& lambda, Args&&... args);

    // This overload to wlock allows a synchronized map whose keys map to synchronized
    // data to use a read lock on the key data and then a write lock on the mapped data.
    template <typename FuncT, typename... Args, bool EnableForMappedType = IsMappedTypeV,
              std::enable_if_t<EnableForMappedType, int> = 0>
    decltype(auto) wlock(FuncT&& lambda, Args&&... args) const;

    // Upgradable lock. If read returns false, write will be called with a unique_lock.
    // Essentially a helper function that does .rlock() followed by .wlock().
    template <typename ReadFuncT, typename WriteFuncT, typename... Args>
    bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args);

private:
    // Target of the delegating move constructor: the unused lock parameter proves
    // that the caller holds `data`'s mutex while its value is moved out.
    synchronized(synchronized&& data, const std::unique_lock<std::shared_mutex>&) noexcept
    : m_data{ std::move(data.m_data) }
    {}

    mutable std::shared_mutex m_mutex = {};
    value_type                m_data  = {};
};
//
// member definitions
//
template <typename LockedType, bool IsMappedTypeV>
template <typename FuncT, typename... Args>
decltype(auto)
synchronized<LockedType, IsMappedTypeV>::rlock(FuncT&& lambda, Args&&... args) const
{
static_assert(std::is_invocable<FuncT, const value_type&, Args...>::value,
"function must accept const reference to locked type");
auto lock = std::shared_lock{ m_mutex };
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
}
template <typename LockedType, bool IsMappedTypeV>
template <typename FuncT, typename... Args>
decltype(auto)
synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args)
{
static_assert(std::is_invocable<FuncT, value_type&, Args...>::value,
"function must accept reference to locked type");
auto lock = std::unique_lock{ m_mutex };
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
}
// This overload to wlock allows a synchronized map whose keys map to synchronized data to
// use a read lock on the key data and then a write lock on the mapped data.
template <typename LockedType, bool IsMappedTypeV>
template <typename FuncT, typename... Args, bool EnableForMappedType,
std::enable_if_t<EnableForMappedType, int>>
decltype(auto)
synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args) const
{
return const_cast<this_type*>(this)->wlock(std::forward<FuncT>(lambda),
std::forward<Args>(args)...);
}
// Upgradable lock. If read returns false, write will be called with a unique_lock.
// Essentially a helper function that does .rlock() followed by .wlock().
template <typename LockedType, bool IsMappedTypeV>
template <typename ReadFuncT, typename WriteFuncT, typename... Args>
bool
synchronized<LockedType, IsMappedTypeV>::ulock(ReadFuncT&& read, WriteFuncT&& write,
Args&&... args)
{
static_assert(std::is_invocable<ReadFuncT, const value_type&, Args...>::value,
"read function must accept const reference to locked type");
static_assert(std::is_invocable<WriteFuncT, value_type&, Args...>::value,
"write function must accept reference to locked type");
using read_return_type = std::invoke_result_t<ReadFuncT, const value_type&, Args...>;
using write_return_type = std::invoke_result_t<WriteFuncT, value_type&, Args...>;
static_assert(std::is_same<read_return_type, write_return_type>::value,
"read and write functions must return same type");
static_assert(std::is_same<read_return_type, bool>::value,
"read/write functions must return bool");
{
auto lock = std::shared_lock{ m_mutex };
if(read(m_data, std::forward<Args>(args)...)) return true;
}
auto lock = std::unique_lock{ m_mutex };
return write(m_data, std::forward<Args>(args)...);
}
} // namespace common
} // namespace rocprofsys
+7 -3
ファイルの表示
@@ -14,6 +14,7 @@ set(core_sources
${CMAKE_CURRENT_LIST_DIR}/mproc.cpp
${CMAKE_CURRENT_LIST_DIR}/perf.cpp
${CMAKE_CURRENT_LIST_DIR}/perfetto.cpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp
${CMAKE_CURRENT_LIST_DIR}/state.cpp
${CMAKE_CURRENT_LIST_DIR}/timemory.cpp
${CMAKE_CURRENT_LIST_DIR}/utility.cpp)
@@ -29,13 +30,13 @@ set(core_headers
${CMAKE_CURRENT_LIST_DIR}/dynamic_library.hpp
${CMAKE_CURRENT_LIST_DIR}/exception.hpp
${CMAKE_CURRENT_LIST_DIR}/gpu.hpp
${CMAKE_CURRENT_LIST_DIR}/hip_runtime.hpp
${CMAKE_CURRENT_LIST_DIR}/locking.hpp
${CMAKE_CURRENT_LIST_DIR}/mproc.hpp
${CMAKE_CURRENT_LIST_DIR}/perf.hpp
${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp
${CMAKE_CURRENT_LIST_DIR}/rccl.hpp
${CMAKE_CURRENT_LIST_DIR}/redirect.hpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
${CMAKE_CURRENT_LIST_DIR}/state.hpp
${CMAKE_CURRENT_LIST_DIR}/timemory.hpp
${CMAKE_CURRENT_LIST_DIR}/utility.hpp)
@@ -54,6 +55,10 @@ add_subdirectory(containers)
target_include_directories(rocprofiler-systems-core-library BEFORE
PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_include_directories(
rocprofiler-systems-core-library
PRIVATE ${PROJECT_SOURCE_DIR}/external/timemory/source/timemory/tpls/cereal)
target_link_libraries(rocprofiler-systems-core-library
PRIVATE rocprofiler-systems::rocprofiler-systems-interface-library)
target_link_libraries(
@@ -67,8 +72,7 @@ target_link_libraries(
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-perfetto>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-timemory>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-mpi>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-hip>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm-smi>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libgcc-optional>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libstdcxx-optional>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-sanitizer>
+12 -85
ファイルの表示
@@ -222,17 +222,6 @@ init_parser(parser_data& _data)
_data.dl_libpath = get_realpath(get_internal_libpath("librocprof-sys-dl.so").c_str());
_data.omni_libpath = get_realpath(get_internal_libpath("librocprof-sys.so").c_str());
#if defined(ROCPROFSYS_USE_ROCTRACER) || defined(ROCPROFSYS_USE_ROCPROFILER)
update_env(_data, "HSA_TOOLS_LIB", _data.dl_libpath);
if(!getenv("HSA_TOOLS_REPORT_LOAD_FAILURE"))
update_env(_data, "HSA_TOOLS_REPORT_LOAD_FAILURE", "1");
#endif
#if defined(ROCPROFSYS_USE_ROCPROFILER)
update_env(_data, "ROCP_TOOL_LIB", _data.omni_libpath);
if(!getenv("ROCP_HSA_INTERCEPT")) update_env(_data, "ROCP_HSA_INTERCEPT", "1");
#endif
#if defined(ROCPROFSYS_USE_OMPT)
if(!getenv("OMP_TOOL_LIBRARIES"))
update_env(_data, "OMP_TOOL_LIBRARIES", _data.dl_libpath, UPD_PREPEND);
@@ -300,15 +289,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
%{INDENT}% 0 avoid triggering the bug, potentially at the cost of reduced performance
%{INDENT}% 1 do not modify how ROCm is notified about kernel completion)";
auto _realtime_reqs =
(tim::get_env("HSA_ENABLE_INTERRUPT", std::string{}, false).empty())
? strvec_t{ "hsa-interrupt" }
: strvec_t{};
#if ROCPROFSYS_USE_ROCTRACER == 0 && ROCPROFSYS_USE_ROCPROFILER == 0
_realtime_reqs.clear();
#endif
const auto* _trace_policy_desc =
R"(Policy for new data when the buffer size limit is reached:
%{INDENT}%- discard : new data is ignored
@@ -579,45 +559,29 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_backend_choices.erase("rcclp");
#endif
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
#if !defined(ROCPROFSYS_USE_ROCM)
_backend_choices.erase("amd-smi");
_backend_choices.erase("rocm-smi");
#endif
#if !defined(ROCPROFSYS_USE_ROCTRACER)
_backend_choices.erase("roctracer");
_backend_choices.erase("roctx");
#endif
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
_backend_choices.erase("rocprofiler");
_backend_choices.erase("rocprofiler-sdk");
_backend_choices.erase("rocm");
#endif
if(gpu::device_count() == 0)
{
// remove GPU-specific backends
_backend_choices.erase("rcclp");
_backend_choices.erase("amd-smi");
_backend_choices.erase("rocm-smi");
_backend_choices.erase("roctracer");
_backend_choices.erase("rocprofiler");
_backend_choices.erase("rocprofiler-sdk");
_backend_choices.erase("rocm");
#if defined(ROCPROFSYS_USE_RCCL)
update_env(_data, "ROCPROFSYS_USE_RCCLP", false);
#endif
#if defined(ROCPROFSYS_USE_ROCM_SMI)
#if defined(ROCPROFSYS_USE_ROCM)
update_env(_data, "ROCPROFSYS_USE_ROCM_SMI", false);
#endif
#if defined(ROCPROFSYS_USE_ROCTRACER)
update_env(_data, "ROCPROFSYS_USE_ROCTRACER", false);
update_env(_data, "ROCPROFSYS_USE_ROCTX", false);
update_env(_data, "ROCPROFSYS_ROCTRACER_HSA_ACTIVITY", false);
update_env(_data, "ROCPROFSYS_ROCTRACER_HIP_ACTIVITY", false);
_backend_choices.erase("roctracer");
_backend_choices.erase("roctx");
#endif
#if defined(ROCPROFSYS_USE_ROCPROFILER)
update_env(_data, "ROCPROFSYS_USE_ROCPROFILER", false);
update_env(_data, "ROCPROFSYS_USE_ROCM", false);
#endif
}
@@ -640,11 +604,9 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
@@ -676,28 +638,13 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_update("ROCPROFSYS_USE_KOKKOSP", _v.count("kokkosp") > 0);
_update("ROCPROFSYS_USE_MPIP", _v.count("mpip") > 0);
_update("ROCPROFSYS_USE_OMPT", _v.count("ompt") > 0);
_update("ROCPROFSYS_USE_ROCM", _v.count("rocm") > 0);
_update("ROCPROFSYS_USE_RCCLP", _v.count("rcclp") > 0);
_update("ROCPROFSYS_USE_ROCTX", _v.count("roctx") > 0);
_update("ROCPROFSYS_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
_update("ROCPROFSYS_USE_ROCTRACER", _v.count("roctracer") > 0);
_update("ROCPROFSYS_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
_update("ROCPROFSYS_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
_update("ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
if(_v.count("all") > 0 ||
(_v.count("roctracer") > 0 && _v.count("rocprofiler") > 0))
{
remove_env(_data, "HSA_TOOLS_LIB");
remove_env(_data, "HSA_TOOLS_REPORT_LOAD_FAILURE");
}
if(_v.count("all") > 0 || _v.count("rocprofiler") > 0)
{
remove_env(_data, "ROCP_TOOL_LIB");
remove_env(_data, "ROCP_HSA_INTERCEPT");
}
if(_v.count("all") > 0 || _v.count("ompt") > 0)
remove_env(_data, "OMP_TOOL_LIBRARIES");
@@ -1126,7 +1073,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_parser.add_argument({ "--sample-realtime" }, _realtime_desc)
.min_count(0)
.dtype("[freq] [delay] [tids...]")
.required(std::move(_realtime_reqs))
.action([&](parser_t& p) {
auto _v = p.get<std::deque<std::string>>("sample-realtime");
update_env(_data, "ROCPROFSYS_SAMPLING_REALTIME", true);
@@ -1210,25 +1156,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_data.processed_environs.emplace("papi_events");
}
#if defined(ROCPROFSYS_USE_ROCPROFILER)
if(_data.environ_filter("gpu_events", _data))
{
_parser
.add_argument({ "-G", "--gpu-events" },
"Set the GPU hardware counter events to record (ref: "
"`rocprof-sys-avail -H -c GPU`)")
.min_count(1)
.dtype("[EVENT ...]")
.action([&](parser_t& p) {
auto _events = join(array_config_t{ "," }, p.get<strvec_t>("gpu-events"));
update_env(_data, "ROCPROFSYS_ROCM_EVENTS", _events);
});
_data.processed_environs.emplace("gpu_events");
_data.processed_environs.emplace("rocm_events");
}
#endif
add_group_arguments(_parser, "category", _data, true);
add_group_arguments(_parser, "io", _data, true);
add_group_arguments(_parser, "perfetto", _data, true);
+18 -14
ファイルの表示
@@ -91,19 +91,21 @@ ROCPROFSYS_DEFINE_CATEGORY(project, rocprofsys, ROCPROFSYS_CATEGORY_NONE, "rocpr
ROCPROFSYS_DEFINE_CATEGORY(category, host, ROCPROFSYS_CATEGORY_HOST, "host", "Host-side function tracing")
ROCPROFSYS_DEFINE_CATEGORY(category, user, ROCPROFSYS_CATEGORY_USER, "user", "User-defined regions")
ROCPROFSYS_DEFINE_CATEGORY(category, python, ROCPROFSYS_CATEGORY_PYTHON, "python", "Python regions")
ROCPROFSYS_DEFINE_CATEGORY(category, device_hip, ROCPROFSYS_CATEGORY_DEVICE_HIP, "device_hip", "Device-side functions submitted via HIP API")
ROCPROFSYS_DEFINE_CATEGORY(category, device_hsa, ROCPROFSYS_CATEGORY_DEVICE_HSA, "device_hsa", "Device-side functions submitted via HSA API")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip, ROCPROFSYS_CATEGORY_ROCM_HIP, "rocm_hip", "Host-side HIP functions")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hsa, ROCPROFSYS_CATEGORY_ROCM_HSA, "rocm_hsa", "Host-side HSA functions")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_roctx, ROCPROFSYS_CATEGORY_ROCM_ROCTX, "rocm_roctx", "ROCTx labels")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm, ROCPROFSYS_CATEGORY_ROCM, "rocm", "General ROCm tracing")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip_api, ROCPROFSYS_CATEGORY_ROCM_HIP_API, "rocm_hip_api", "ROCm HIP functions")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hsa_api, ROCPROFSYS_CATEGORY_ROCM_HSA_API, "rocm_hsa_api", "ROCm HSA functions")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_kernel_dispatch, ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH, "rocm_kernel_dispatch", "ROCm Kernel dispatch")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_memory_copy, ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY, "rocm_memory_copy", "ROCm Async Memory Copy")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_scratch_memory, ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY, "rocm_scratch_memory", "ROCm kernel scratch memory reallocations")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_page_migration, ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION, "rocm_page_migration", "ROCm memory page migration")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION, "rocm_counter_collection", "ROCm device counter collection")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi, ROCPROFSYS_CATEGORY_ROCM_SMI, "rocm_smi", "rocm-smi data")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_busy, ROCPROFSYS_CATEGORY_ROCM_SMI_BUSY, "device_busy", "Busy percentage of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, "device_temp", "Temperature of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions")
ROCPROFSYS_DEFINE_CATEGORY(category, roctracer, ROCPROFSYS_CATEGORY_ROCTRACER, "roctracer", "Kernel tracing provided by roctracer")
ROCPROFSYS_DEFINE_CATEGORY(category, rocprofiler, ROCPROFSYS_CATEGORY_ROCPROFILER, "rocprofiler", "HW counter data provided by rocprofiler")
ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions")
ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions")
ROCPROFSYS_DEFINE_CATEGORY(category, mpi, ROCPROFSYS_CATEGORY_MPI, "mpi", "MPI regions")
@@ -151,19 +153,21 @@ using name = perfetto_category<Tp...>;
ROCPROFSYS_PERFETTO_CATEGORY(category::user), \
ROCPROFSYS_PERFETTO_CATEGORY(category::python), \
ROCPROFSYS_PERFETTO_CATEGORY(category::sampling), \
ROCPROFSYS_PERFETTO_CATEGORY(category::device_hip), \
ROCPROFSYS_PERFETTO_CATEGORY(category::device_hsa), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hip), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hsa), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_roctx), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hip_api), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hsa_api), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_kernel_dispatch), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_memory_copy), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_scratch_memory), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_page_migration), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_counter_collection), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_marker_api), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_busy), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_temp), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \
ROCPROFSYS_PERFETTO_CATEGORY(category::roctracer), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocprofiler), \
ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \
ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \
ROCPROFSYS_PERFETTO_CATEGORY(category::mpi), \
+1 -9
ファイルの表示
@@ -96,14 +96,6 @@ struct functors;
} // namespace component
} // namespace rocprofsys
#if !defined(ROCPROFSYS_USE_ROCTRACER)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type)
#endif
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type)
#endif
#if !defined(ROCPROFSYS_USE_RCCL)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, category::rocm_rccl, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type)
@@ -124,7 +116,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_cpu_clock, fa
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_percent, false_type)
#endif
#if !defined(TIMEMORY_USE_LIBUNWIND) || !defined(ROCPROFSYS_USE_ROCM_SMI)
#if !defined(TIMEMORY_USE_LIBUNWIND) || !defined(ROCPROFSYS_USE_ROCM)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type)
+57 -200
ファイルの表示
@@ -22,6 +22,7 @@
#include "config.hpp"
#include "common/defines.h"
#include "common/static_object.hpp"
#include "constraint.hpp"
#include "debug.hpp"
#include "defines.hpp"
@@ -29,9 +30,9 @@
#include "mproc.hpp"
#include "perf.hpp"
#include "perfetto.hpp"
#include "rocprofiler-sdk.hpp"
#include "utility.hpp"
#include <asm-generic/errno-base.h>
#include <timemory/backends/capability.hpp>
#include <timemory/backends/dmp.hpp>
#include <timemory/backends/mpi.hpp>
@@ -52,6 +53,7 @@
#include <timemory/utility/filepath.hpp>
#include <timemory/utility/join.hpp>
#include <timemory/utility/signals.hpp>
#include <timemory/utility/types.hpp>
#include <algorithm>
#include <array>
@@ -60,6 +62,7 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <limits>
#include <linux/capability.h>
@@ -67,6 +70,7 @@
#include <ostream>
#include <sstream>
#include <string>
#include <type_traits>
#include <unistd.h>
#include <utility>
@@ -76,6 +80,11 @@ using settings = tim::settings;
namespace
{
int verbose_value = tim::get_env<int>("ROCPROFSYS_VERBOSE", 0, false);
bool debug_value = tim::get_env<bool>("ROCPROFSYS_DEBUG", false, false);
bool is_ci_value = tim::get_env<bool>("ROCPROFSYS_CI", false, false);
auto configure_once = std::once_flag{};
TIMEMORY_NOINLINE bool&
_settings_are_configured()
{
@@ -83,6 +92,14 @@ _settings_are_configured()
return _v;
}
// Returns a reference to the pointer produced by
// common::static_object<std::shared_ptr<settings>>::construct(), which is seeded
// with settings::shared_instance(). The reference is bound once, in a function-local
// static, so later calls hand back the same pointer without calling construct() again.
// NOTE(review): the common::do_not_destroy tag presumably asks static_object to skip
// destroying the held object at exit -- confirm against common/static_object.hpp.
auto*&
get_config_impl()
{
static auto*& _v = common::static_object<std::shared_ptr<settings>>::construct(
common::do_not_destroy{}, settings::shared_instance());
return _v;
}
auto
get_config()
{
@@ -97,7 +114,7 @@ get_config()
std::string
get_setting_name(std::string _v)
{
static const auto _prefix = tim::string_view_t{ "rocprofsys_" };
constexpr auto _prefix = tim::string_view_t{ "rocprofsys_" };
for(auto& itr : _v)
itr = tolower(itr);
auto _pos = _v.find(_prefix);
@@ -195,7 +212,7 @@ configure_settings(bool _init)
if(settings_are_configured()) return;
if(get_is_continuous_integration() && get_state() < State::Init)
if(is_ci_value && get_state() < State::Init)
{
timemory_print_demangled_backtrace<64>();
ROCPROFSYS_THROW("config::configure_settings() called before "
@@ -220,17 +237,17 @@ configure_settings(bool _init)
tim::manager::add_metadata("ROCPROFSYS_COMPILER_VERSION",
ROCPROFSYS_COMPILER_VERSION);
#if ROCPROFSYS_HIP_VERSION > 0
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION", ROCPROFSYS_HIP_VERSION_STRING);
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_MAJOR",
ROCPROFSYS_HIP_VERSION_MAJOR);
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_MINOR",
ROCPROFSYS_HIP_VERSION_MINOR);
tim::manager::add_metadata("ROCPROFSYS_HIP_VERSION_PATCH",
ROCPROFSYS_HIP_VERSION_PATCH);
#if ROCPROFSYS_ROCM_VERSION > 0
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION", ROCPROFSYS_ROCM_VERSION_STRING);
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_MAJOR",
ROCPROFSYS_ROCM_VERSION_MAJOR);
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_MINOR",
ROCPROFSYS_ROCM_VERSION_MINOR);
tim::manager::add_metadata("ROCPROFSYS_ROCM_VERSION_PATCH",
ROCPROFSYS_ROCM_VERSION_PATCH);
#endif
auto _config = settings::shared_instance();
auto _config = *get_config_impl();
// if using timemory, default to perfetto being off
auto _default_perfetto_v = !tim::get_env<bool>("ROCPROFSYS_PROFILE", false, false);
@@ -294,24 +311,15 @@ configure_settings(bool _init)
"Enable causal profiling analysis", false, "backend",
"causal", "analysis");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCTRACER",
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCM",
"Enable ROCm API and kernel tracing", true, "backend",
"roctracer", "rocm");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCPROFILER",
"Enable ROCm hardware counters", true, "backend",
"rocprofiler", "rocm");
"rocm");
ROCPROFSYS_CONFIG_SETTING(
bool, "ROCPROFSYS_USE_ROCM_SMI",
"Enable sampling GPU power, temp, utilization, and memory usage", true, "backend",
"rocm_smi", "rocm", "process_sampling");
ROCPROFSYS_CONFIG_SETTING(
bool, "ROCPROFSYS_USE_ROCTX",
"Enable ROCtx API. Warning! Out-of-order ranges may corrupt perfetto flamegraph",
false, "backend", "roctracer", "rocm", "roctx");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_SAMPLING",
"Enable statistical sampling of call-stack", false,
"backend", "sampling");
@@ -616,41 +624,7 @@ configure_settings(bool _init)
"sampling", "hardware_counters")
->set_choices(perf::get_config_choices());
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HIP_API",
"Enable HIP API tracing support", true, "roctracer", "rocm",
"advanced");
ROCPROFSYS_CONFIG_SETTING(
bool, "ROCPROFSYS_ROCTRACER_HIP_API_BACKTRACE",
"Enable annotating the perfetto debug annotation with backtraces", false,
"roctracer", "rocm", "perfetto", "advanced");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HIP_ACTIVITY",
"Enable HIP activity tracing support", true, "roctracer",
"rocm", "advanced");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HSA_ACTIVITY",
"Enable HSA activity tracing support", false, "roctracer",
"rocm", "advanced");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_HSA_API",
"Enable HSA API tracing support", false, "roctracer",
"rocm", "advanced");
ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCTRACER_HSA_API_TYPES",
"HSA API type to collect", "", "roctracer", "rocm",
"advanced");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_ROCTRACER_DISCARD_BARRIERS",
"Skip barrier marker events in traces", false, "roctracer",
"rocm", "advanced");
ROCPROFSYS_CONFIG_SETTING(
std::string, "ROCPROFSYS_ROCM_EVENTS",
"ROCm hardware counters. Use ':device=N' syntax to specify collection on device "
"number N, e.g. ':device=0'. If no device specification is provided, the event "
"is collected on every available device",
"", "rocprofiler", "rocm", "hardware_counters");
rocprofiler_sdk::config_settings(_config);
ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS",
"rocm-smi metrics to collect: busy, temp, power, mem_usage",
@@ -670,12 +644,6 @@ configure_settings(bool _init)
"default to the value of ROCPROFSYS_COLLAPSE_PROCESSES",
false, "perfetto", "data", "advanced");
ROCPROFSYS_CONFIG_SETTING(
bool, "ROCPROFSYS_PERFETTO_ROCTRACER_PER_STREAM",
"Separate roctracer GPU side traces (copies, kernels) into separate "
"tracks based on the stream they're enqueued into",
true, "perfetto", "roctracer", "rocm", "advanced");
ROCPROFSYS_CONFIG_SETTING(
std::string, "ROCPROFSYS_PERFETTO_FILL_POLICY",
"Behavior when perfetto buffer is full. 'discard' will ignore new entries, "
@@ -704,18 +672,6 @@ configure_settings(bool _init)
"feature may dramatically reduce the size of the trace",
true, "perfetto", "data", "debugging", "advanced");
ROCPROFSYS_CONFIG_SETTING(
bool, "ROCPROFSYS_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS",
"When PERFETTO_ANNOTATIONS, USE_ROCTRACER, and ROCTRACER_HIP_API are all "
"enabled, enabling this option will result in the arg information for HIP API "
"calls to all be within one annotation (e.g., args=\"stream=0x0, dst=0x1F, "
"sizeBytes=64, src=0x08, kind=1\"). When disabled, each parameter will be an "
"individual annotation (e.g. stream, dst, sizeBytes, etc.). The benefit of the "
"former is that it is faster to serialize and consumes less file space; the "
"benefit of the latter is that it becomes much easier to find slices in the "
"trace with the same value",
false, "perfetto", "data", "debugging", "roctracer", "rocm", "advanced");
ROCPROFSYS_CONFIG_SETTING(
uint64_t, "ROCPROFSYS_THREAD_POOL_SIZE",
"Max number of threads for processing background tasks",
@@ -1045,6 +1001,10 @@ configure_settings(bool _init)
settings::suppress_config() = true;
if(auto opt = get_setting_value<int>("ROCPROFSYS_VERBOSE"); opt) verbose_value = *opt;
if(auto opt = get_setting_value<bool>("ROCPROFSYS_DEBUG"); opt) debug_value = *opt;
if(auto opt = get_setting_value<bool>("ROCPROFSYS_CI"); opt) is_ci_value = *opt;
if(get_env("ROCPROFSYS_MONOCHROME", _config->get<bool>("ROCPROFSYS_MONOCHROME")))
tim::log::monochrome() = true;
@@ -1106,6 +1066,10 @@ configure_settings(bool _init)
ROCPROFSYS_BASIC_VERBOSE(2, "configuration complete\n");
if(auto opt = get_setting_value<int>("ROCPROFSYS_VERBOSE"); opt) verbose_value = *opt;
if(auto opt = get_setting_value<bool>("ROCPROFSYS_DEBUG"); opt) debug_value = *opt;
if(auto opt = get_setting_value<bool>("ROCPROFSYS_CI"); opt) is_ci_value = *opt;
_settings_are_configured() = true;
}
@@ -1140,8 +1104,6 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
_set("ROCPROFSYS_PROFILE", false);
_set("ROCPROFSYS_USE_CAUSAL", false);
_set("ROCPROFSYS_USE_ROCM_SMI", false);
_set("ROCPROFSYS_USE_ROCTRACER", false);
_set("ROCPROFSYS_USE_ROCPROFILER", false);
_set("ROCPROFSYS_USE_KOKKOSP", false);
_set("ROCPROFSYS_USE_RCCLP", false);
_set("ROCPROFSYS_USE_OMPT", false);
@@ -1164,12 +1126,11 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
if(gpu::device_count() == 0)
{
#if ROCPROFSYS_HIP_VERSION > 0
ROCPROFSYS_BASIC_VERBOSE(1, "No HIP devices were found: disabling roctracer, "
"rocprofiler, and rocm_smi...\n");
#if ROCPROFSYS_ROCM_VERSION > 0
ROCPROFSYS_BASIC_VERBOSE(
1, "No ROCm devices were found: disabling rocm and rocm_smi...\n");
#endif
_set("ROCPROFSYS_USE_ROCPROFILER", false);
_set("ROCPROFSYS_USE_ROCTRACER", false);
_set("ROCPROFSYS_USE_ROCM", false);
_set("ROCPROFSYS_USE_ROCM_SMI", false);
}
@@ -1202,9 +1163,8 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
_set("ROCPROFSYS_USE_TRACE", false);
_set("ROCPROFSYS_PROFILE", false);
_set("ROCPROFSYS_USE_CAUSAL", false);
_set("ROCPROFSYS_USE_ROCM", false);
_set("ROCPROFSYS_USE_ROCM_SMI", false);
_set("ROCPROFSYS_USE_ROCTRACER", false);
_set("ROCPROFSYS_USE_ROCPROFILER", false);
_set("ROCPROFSYS_USE_KOKKOSP", false);
_set("ROCPROFSYS_USE_RCCLP", false);
_set("ROCPROFSYS_USE_OMPT", false);
@@ -1389,22 +1349,9 @@ configure_disabled_settings(const std::shared_ptr<settings>& _config)
_handle_use_option("ROCPROFSYS_USE_OMPT", "ompt");
_handle_use_option("ROCPROFSYS_USE_RCCLP", "rcclp");
_handle_use_option("ROCPROFSYS_USE_ROCM_SMI", "rocm_smi");
_handle_use_option("ROCPROFSYS_USE_ROCTRACER", "roctracer");
_handle_use_option("ROCPROFSYS_USE_ROCPROFILER", "rocprofiler");
_handle_use_option("ROCPROFSYS_USE_ROCM", "rocm");
#if !defined(ROCPROFSYS_USE_ROCTRACER) || ROCPROFSYS_USE_ROCTRACER == 0
_config->find("ROCPROFSYS_USE_ROCTRACER")->second->set_hidden(true);
for(const auto& itr : _config->disable_category("roctracer"))
_config->find(itr)->second->set_hidden(true);
#endif
#if !defined(ROCPROFSYS_USE_ROCPROFILER) || ROCPROFSYS_USE_ROCPROFILER == 0
_config->find("ROCPROFSYS_USE_ROCPROFILER")->second->set_hidden(true);
for(const auto& itr : _config->disable_category("rocprofiler"))
_config->find(itr)->second->set_hidden(true);
#endif
#if !defined(ROCPROFSYS_USE_ROCM_SMI) || ROCPROFSYS_USE_ROCM_SMI == 0
#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0
_config->find("ROCPROFSYS_USE_ROCM_SMI")->second->set_hidden(true);
for(const auto& itr : _config->disable_category("rocm_smi"))
_config->find(itr)->second->set_hidden(true);
@@ -1567,7 +1514,7 @@ print_banner(std::ostream& _os)
{ "tag", ROCPROFSYS_GIT_DESCRIBE },
{ "", ROCPROFSYS_LIBRARY_ARCH },
{ "compiler", ROCPROFSYS_COMPILER_STRING },
{ "rocm", ROCPROFSYS_HIP_VERSION_COMPAT_STRING } });
{ "rocm", ROCPROFSYS_ROCM_VERSION_COMPAT_STRING } });
// <NAME> <VERSION> (<PROPERTIES>)
if(!_properties.empty())
@@ -1797,10 +1744,7 @@ get_debug_env()
bool
get_is_continuous_integration()
{
if(!settings_are_configured())
return tim::get_env<bool>("ROCPROFSYS_CI", false, false);
static auto _v = get_config()->find("ROCPROFSYS_CI");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
return is_ci_value;
}
bool
@@ -1818,8 +1762,8 @@ get_debug_finalize()
bool
get_debug()
{
static auto _v = get_config()->find("ROCPROFSYS_DEBUG");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
std::call_once(configure_once, []() { (void) get_config(); });
return debug_value;
}
bool
@@ -1842,15 +1786,15 @@ get_verbose_env()
int
get_verbose()
{
static auto _v = get_config()->find("ROCPROFSYS_VERBOSE");
return static_cast<tim::tsettings<int>&>(*_v->second).get();
std::call_once(configure_once, []() { (void) get_config(); });
return verbose_value;
}
bool&
get_use_perfetto()
{
static auto _v = get_config()->find("ROCPROFSYS_TRACE");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
static auto _v = get_config()->at("ROCPROFSYS_TRACE");
return static_cast<tim::tsettings<bool>&>(*_v).get();
}
bool&
@@ -1867,43 +1811,10 @@ get_use_causal()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_use_roctracer()
{
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCTRACER");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
#else
return false;
#endif
}
bool
get_perfetto_roctracer_per_stream()
{
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_ROCTRACER_PER_STREAM");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
#else
return false;
#endif
}
bool
get_use_rocprofiler()
{
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCPROFILER");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
#else
return false;
#endif
}
bool
get_use_rocm_smi()
{
#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCM_SMI");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
#else
@@ -1911,17 +1822,6 @@ get_use_rocm_smi()
#endif
}
bool
get_use_roctx()
{
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
static auto _v = get_config()->find("ROCPROFSYS_USE_ROCTX");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
#else
return false;
#endif
}
bool&
get_use_sampling()
{
@@ -2031,34 +1931,6 @@ get_sampling_cputime_signal()
return static_cast<tim::tsettings<int>&>(*_v->second).get();
}
bool
get_trace_hip_api()
{
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HIP_API");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_trace_hip_activity()
{
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HIP_ACTIVITY");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_trace_hsa_api()
{
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HSA_API");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_trace_hsa_activity()
{
static auto _v = get_config()->find("ROCPROFSYS_ROCTRACER_HSA_ACTIVITY");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
size_t
get_perfetto_shmem_size_hint()
{
@@ -2176,14 +2048,6 @@ get_thread_pool_size()
return _v;
}
// Returns a copy of the "ROCPROFSYS_ROCTRACER_HSA_API_TYPES" string setting.
// The value is read from the configuration on the first call only and kept in
// a function-local static, so every call returns that first-read value.
std::string
get_trace_hsa_api_types()
{
    static std::string _cached_types =
        get_config()->get<std::string>("ROCPROFSYS_ROCTRACER_HSA_API_TYPES");
    return _cached_types;
}
std::string&
get_perfetto_backend()
{
@@ -2360,7 +2224,7 @@ get_process_sampling_duration()
std::string
get_sampling_gpus()
{
#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
static auto _v = get_config()->find("ROCPROFSYS_SAMPLING_GPUS");
return static_cast<tim::tsettings<std::string>&>(*_v->second).get();
#else
@@ -2375,13 +2239,6 @@ get_trace_thread_locks()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
std::string
get_rocm_events()
{
static auto _v = get_config()->find("ROCPROFSYS_ROCM_EVENTS");
return static_cast<tim::tsettings<std::string>&>(*_v->second).get();
}
bool
get_trace_thread_rwlocks()
{
+21 -41
ファイルの表示
@@ -101,17 +101,22 @@ get_exe_realpath();
template <typename Tp>
bool
set_setting_value(const std::string& _name, Tp&& _v)
set_setting_value(const std::string& _name, Tp&& _v,
settings::update_type _upd = settings::update_type::user)
{
auto _user_upd = tim::settings::update_type::user;
auto _instance = tim::settings::shared_instance();
auto _setting = _instance->find(_name);
auto* _instance = tim::settings::instance();
if(!_instance) return false;
auto _setting = _instance->find(_name);
if(_setting == _instance->end()) return false;
if(!_setting->second) return false;
auto& itr = _setting->second;
auto _upd = itr->set_user_updated();
auto _success = itr->set(std::forward<Tp>(_v), _user_upd);
if(!_success) itr->set_updated(_upd);
auto _old_upd = itr->get_updated_type();
auto _success = itr->set(std::forward<Tp>(_v), _upd);
if(!_success) itr->set_updated(_old_upd);
return _success;
}
@@ -119,10 +124,13 @@ template <typename Tp>
bool
set_default_setting_value(const std::string& _name, Tp&& _v)
{
auto _instance = tim::settings::shared_instance();
auto _setting = _instance->find(_name);
auto* _instance = tim::settings::instance();
if(!_instance) return false;
auto _setting = _instance->find(_name);
if(_setting == _instance->end()) return false;
if(!_setting->second) return false;
if(_setting->second->get_config_updated() || _setting->second->get_environ_updated())
return false;
return _setting->second->set(std::forward<Tp>(_v));
@@ -132,10 +140,12 @@ template <typename Tp>
std::optional<Tp>
get_setting_value(const std::string& _name)
{
auto _instance = tim::settings::shared_instance();
if(!_instance) return std::optional<Tp>{};
auto* _instance = tim::settings::instance();
if(!_instance) return std::nullopt;
auto _setting = _instance->find(_name);
if(_setting == _instance->end() || !_setting->second) return std::optional<Tp>{};
auto&& _ret = _setting->second->get<Tp>();
return (_ret.first) ? std::optional<Tp>{ _ret.second } : std::optional<Tp>{};
}
@@ -194,18 +204,9 @@ get_use_timemory() ROCPROFSYS_HOT;
bool&
get_use_causal() ROCPROFSYS_HOT;
bool
get_use_roctracer() ROCPROFSYS_HOT;
bool
get_use_rocprofiler() ROCPROFSYS_HOT;
bool
get_use_rocm_smi() ROCPROFSYS_HOT;
bool
get_use_roctx();
bool&
get_use_sampling() ROCPROFSYS_HOT;
@@ -236,18 +237,6 @@ get_sampling_keep_internal();
bool
get_use_rcclp();
bool
get_trace_hip_api();
bool
get_trace_hip_activity();
bool
get_trace_hsa_api();
bool
get_trace_hsa_activity();
size_t
get_perfetto_shmem_size_hint();
@@ -272,9 +261,6 @@ get_perfetto_annotations() ROCPROFSYS_HOT;
uint64_t
get_thread_pool_size();
std::string
get_trace_hsa_api_types();
std::string&
get_perfetto_backend();
@@ -282,9 +268,6 @@ get_perfetto_backend();
std::string
get_perfetto_output_filename();
bool
get_perfetto_roctracer_per_stream() ROCPROFSYS_HOT;
double
get_trace_delay();
@@ -360,9 +343,6 @@ get_trace_thread_barriers();
bool
get_trace_thread_join();
std::string
get_rocm_events();
bool
get_use_tmp_files();
+9 -9
ファイルの表示
@@ -209,7 +209,7 @@ public:
void push_back(Tp&& t);
template <typename... Args>
void emplace_back(Args&&... args);
decltype(auto) emplace_back(Args&&... args);
reference operator[](size_type i);
@@ -229,6 +229,14 @@ private:
storage_type m_chunks;
};
// Constructs a new element from `args` by perfect-forwarding them to
// emplace_back() of the chunk returned by last_chunk(), and hands back exactly
// what that call returns: decltype(auto) keeps the value category, so a
// reference result stays a reference.
// NOTE(review): last_chunk() is defined elsewhere; presumably it yields the
// chunk that still has room (adding one when needed) -- confirm.
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
template <typename... Args>
decltype(auto)
stable_vector<Tp, ChunkSizeV, AlignN>::emplace_back(Args&&... args)
{
return last_chunk().emplace_back(std::forward<Args>(args)...);
}
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
stable_vector<Tp, ChunkSizeV, AlignN>::stable_vector(size_type count, const Tp& value)
{
@@ -332,14 +340,6 @@ stable_vector<Tp, ChunkSizeV, AlignN>::push_back(Tp&& t)
last_chunk().push_back(std::move(t));
}
// Constructs a new element from `args` by perfect-forwarding them to
// emplace_back() of the chunk returned by last_chunk(). The result of the
// chunk's emplace_back() is discarded (this overload returns void).
// NOTE(review): last_chunk() is defined elsewhere; presumably it yields the
// chunk that still has room (adding one when needed) -- confirm.
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
template <typename... Args>
void
stable_vector<Tp, ChunkSizeV, AlignN>::emplace_back(Args&&... args)
{
last_chunk().emplace_back(std::forward<Args>(args)...);
}
template <typename Tp, size_t ChunkSizeV, size_t AlignN>
typename stable_vector<Tp, ChunkSizeV, AlignN>::reference
stable_vector<Tp, ChunkSizeV, AlignN>::operator[](size_type i)
+69 -349
ファイルの表示
@@ -20,22 +20,19 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_BEGIN \
namespace tim \
{ \
namespace cereal \
{
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_END \
} \
} // namespace ::tim::cereal
#include "common/defines.h"
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
# define ROCPROFSYS_USE_ROCM_SMI 0
#endif
#if !defined(ROCPROFSYS_USE_HIP)
# define ROCPROFSYS_USE_HIP 0
#endif
#include "core/hip_runtime.hpp"
#if ROCPROFSYS_USE_HIP > 0
# if !defined(TIMEMORY_USE_HIP)
# define TIMEMORY_USE_HIP 1
# endif
#if !defined(ROCPROFSYS_USE_ROCM)
# define ROCPROFSYS_USE_ROCM 0
#endif
#include "debug.hpp"
@@ -44,24 +41,11 @@
#include <timemory/manager.hpp>
#if ROCPROFSYS_USE_ROCM_SMI > 0
#if ROCPROFSYS_USE_ROCM > 0
# include <rocm_smi/rocm_smi.h>
#endif
#if ROCPROFSYS_USE_HIP > 0
# include <timemory/components/hip/backends.hpp>
# if !defined(ROCPROFSYS_HIP_RUNTIME_CALL)
# define ROCPROFSYS_HIP_RUNTIME_CALL(err) \
{ \
if(err != ::tim::hip::success_v && (int) err != 0) \
{ \
ROCPROFSYS_THROW( \
"[%s:%d] Warning! HIP API call failed with code %i :: %s\n", \
__FILE__, __LINE__, (int) err, hipGetErrorString(err)); \
} \
}
# endif
# include <rocprofiler-sdk/agent.h>
# include <rocprofiler-sdk/cxx/serialization.hpp>
# include <rocprofiler-sdk/fwd.h>
#endif
namespace rocprofsys
@@ -70,9 +54,7 @@ namespace gpu
{
namespace
{
namespace scope = ::tim::scope;
#if ROCPROFSYS_USE_ROCM_SMI > 0
#if ROCPROFSYS_USE_ROCM > 0
# define ROCPROFSYS_ROCM_SMI_CALL(ERROR_CODE) \
::rocprofsys::gpu::check_rsmi_error(ERROR_CODE, __FILE__, __LINE__)
@@ -108,99 +90,47 @@ rsmi_init()
return _rsmi_init;
}
#endif
#endif // ROCPROFSYS_USE_ROCM > 0
#if ROCPROFSYS_HIP_VERSION >= 60000
template <typename ArchiveT, typename ArgT,
std::enable_if_t<!std::is_pointer<ArgT>::value, int> = 0>
void
device_prop_serialize(ArchiveT& archive, const char* name, const ArgT& arg)
int32_t
query_rocm_gpu_agents()
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
archive(make_nvp(name, arg));
}
template <typename ArchiveT, typename ArgT, size_t N>
void
device_prop_serialize(ArchiveT& archive, const char* name, ArgT arg[N])
{
if constexpr(!std::is_same<ArgT, char>::value &&
!std::is_same<ArgT, const char>::value)
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
auto data = std::array<int, N>{};
for(size_t i = 0; i < N; ++i)
data[i] = arg[i];
archive(make_nvp(name, data));
}
else
{
device_prop_serialize(archive, name, std::string{ arg });
}
}
template <typename ArchiveT>
void
device_prop_serialize(ArchiveT& archive, const char* name, hipUUID_t arg)
{
constexpr auto N = sizeof(arg.bytes);
namespace cereal = tim::cereal;
using cereal::make_nvp;
auto data = std::array<char, N + 1>{};
data.fill('\0');
for(size_t i = 0; i < N; ++i)
data[i] = arg.bytes[i];
auto str_v = std::string_view{ data.data() };
auto str = std::string{ str_v }.substr(0, str_v.find('\0'));
archive(make_nvp(name, str));
}
template <typename ArchiveT>
void
device_prop_serialize(ArchiveT& archive, const char* name, hipDeviceArch_t arg)
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(NAME) \
{ \
auto val = arg.NAME; \
archive(make_nvp(#NAME, val)); \
int32_t _dev_cnt = 0;
#if ROCPROFSYS_USE_ROCM > 0
auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents,
size_t num_agents, void* user_data) -> rocprofiler_status_t {
auto* _cnt = static_cast<int32_t*>(user_data);
for(size_t i = 0; i < num_agents; ++i)
{
const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);
if(_agent && _agent->type == ROCPROFILER_AGENT_TYPE_GPU) *_cnt += 1;
}
return ROCPROFILER_STATUS_SUCCESS;
};
archive.setNextName(name);
archive.startNode();
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt32Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalFloatAtomicExch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt32Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedFloatAtomicExch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFloatAtomicAdd)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt64Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt64Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDoubles)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpVote)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpBallot)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpShuffle)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFunnelShift)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasThreadFenceSystem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSyncThreadsExt)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSurfaceFuncs)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(has3dGrid)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDynamicParallelism)
archive.finishNode();
# undef ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH
}
try
{
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
sizeof(rocprofiler_agent_v0_t), &_dev_cnt);
} catch(std::exception& _e)
{
ROCPROFSYS_BASIC_VERBOSE(
1, "Exception thrown getting the rocm agents: %s. _dev_cnt=%d\n", _e.what(),
_dev_cnt);
}
// rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
// sizeof(rocprofiler_agent_v0_t), &_dev_cnt);
#endif
return _dev_cnt;
}
} // namespace
int
hip_device_count()
rocm_device_count()
{
#if ROCPROFSYS_USE_HIP > 0
return ::tim::hip::device_count();
#if ROCPROFSYS_USE_ROCM > 0
static int _num_devices = query_rocm_gpu_agents();
return _num_devices;
#else
return 0;
#endif
@@ -209,7 +139,7 @@ hip_device_count()
int
rsmi_device_count()
{
#if ROCPROFSYS_USE_ROCM_SMI > 0
#if ROCPROFSYS_USE_ROCM > 0
if(!rsmi_init()) return 0;
static auto _num_devices = []() {
@@ -234,11 +164,8 @@ rsmi_device_count()
int
device_count()
{
#if ROCPROFSYS_USE_ROCM_SMI > 0
// store as static since calls after rsmi_shutdown will return zero
return rsmi_device_count();
#elif ROCPROFSYS_USE_HIP > 0
return ::tim::hip::device_count();
#if ROCPROFSYS_USE_ROCM > 0
return rocm_device_count();
#else
return 0;
#endif
@@ -246,251 +173,44 @@ device_count()
template <typename ArchiveT>
void
add_hip_device_metadata(ArchiveT& ar)
add_device_metadata(ArchiveT& ar)
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
#if ROCPROFSYS_USE_HIP > 0
int _device_count = 0;
int _current_device = 0;
hipError_t _device_count_err = hipGetDeviceCount(&_device_count);
#if ROCPROFSYS_USE_ROCM > 0
using agent_vec_t = std::vector<rocprofiler_agent_v0_t>;
if(_device_count_err != hipSuccess) return;
hipError_t _current_device_err = hipGetDevice(&_current_device);
scope::destructor _dtor{ [_current_device, _current_device_err]() {
if(_current_device_err == hipSuccess)
auto _agents_vec = agent_vec_t{};
auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents,
size_t num_agents, void* user_data) -> rocprofiler_status_t {
auto* _agents_vec_v = static_cast<agent_vec_t*>(user_data);
_agents_vec_v->reserve(num_agents);
for(size_t i = 0; i < num_agents; ++i)
{
ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(_current_device));
const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);
if(_agent) _agents_vec_v->emplace_back(*_agent);
}
} };
return ROCPROFILER_STATUS_SUCCESS;
};
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
sizeof(rocprofiler_agent_v0_t), &_agents_vec);
if(_current_device_err != hipSuccess || _device_count == 0) return;
ar.setNextName("hip_device_properties");
ar.startNode();
ar.makeArray();
scope::destructor _prop_dtor{ [&ar]() { ar.finishNode(); } };
for(int dev = 0; dev < _device_count; ++dev)
{
auto _device_prop = hipDeviceProp_t{};
int _driver_version = 0;
int _runtime_version = 0;
ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(dev));
ROCPROFSYS_HIP_RUNTIME_CALL(hipGetDeviceProperties(&_device_prop, dev));
ROCPROFSYS_HIP_RUNTIME_CALL(hipDriverGetVersion(&_driver_version));
ROCPROFSYS_HIP_RUNTIME_CALL(hipRuntimeGetVersion(&_runtime_version));
ar.startNode();
# if ROCPROFSYS_HIP_VERSION < 60000
using intvec_t = std::vector<int>;
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \
ar(make_nvp(#NAME, _device_prop.NAME));
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(NAME, ...) \
ar(make_nvp(NAME, __VA_ARGS__));
ar(make_nvp("name", std::string{ _device_prop.name }));
ar(make_nvp("driver_version", _driver_version));
ar(make_nvp("runtime_version", _runtime_version));
ar(make_nvp("capability.major_version", _device_prop.major));
ar(make_nvp("capability.minor_version", _device_prop.minor));
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate)
# if ROCPROFSYS_HIP_VERSION >= 50000
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize)
# endif
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(
"maxThreadsDim",
intvec_t{ _device_prop.maxThreadsDim[0], _device_prop.maxThreadsDim[1],
_device_prop.maxThreadsDim[2] })
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(
"maxGridSize",
intvec_t{ _device_prop.maxGridSize[0], _device_prop.maxGridSize[1],
_device_prop.maxGridSize[2] })
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision)
# else
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \
device_prop_serialize(ar, #NAME, _device_prop.NAME);
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(name)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(uuid)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luid)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luidDeviceNodeMask)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsDim)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxGridSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(major)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(minor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(texturePitchAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deviceOverlap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DMipmap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLinear)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DMipmap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLinear)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DGather)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3DAlt)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemapLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface3D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemapLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(surfaceAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(tccDriver)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asyncEngineCount)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedAddressing)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(persistingL2CacheMaxSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(streamPrioritiesSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(globalL1CacheSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(localL1CacheSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerMultiprocessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerMultiprocessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(managedMemory)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiGpuBoardGroupID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostNativeAtomicSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(singleToDoublePrecisionPerfRatio)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computePreemptionSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canUseHostPointerForRegisteredMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlockOptin)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxBlocksPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(accessPolicyMaxWindowSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(reservedSharedMemPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sparseHipArraySupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterReadOnlySupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(timelineSemaphoreInteropSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolsSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMASupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAFlushWritesOptions)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAWritesOrdering)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolSupportedHandleTypes)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deferredMappingHipArraySupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ipcEventSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clusterLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedFunctionPointers)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(arch)
// ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpMemFlushCntl)
// ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpRegFlushCntl)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedFunc)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedGridDim)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedBlockDim)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedSharedMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isLargeBar)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision)
# endif
const auto _compute_mode_descr = std::array<const char*, 6>{
"Default (multiple host threads can use ::hipSetDevice() with device "
"simultaneously)",
"Exclusive (only one host thread in one process is able to use "
"::hipSetDevice() with this device)",
"Prohibited (no host thread can use ::hipSetDevice() with this device)",
"Exclusive Process (many threads in one process is able to use "
"::hipSetDevice() with this device)",
"Unknown",
nullptr
};
auto _compute_mode = std::min<int>(_device_prop.computeMode, 5);
ar(make_nvp("computeModeDescription",
std::string{ _compute_mode_descr.at(_compute_mode) }));
ar.finishNode();
}
ar(make_nvp("rocm_agents", _agents_vec));
#else
(void) ar;
#endif
}
void
add_hip_device_metadata()
add_device_metadata()
{
if(device_count() == 0) return;
ROCPROFSYS_METADATA([](auto& ar) {
try
{
add_hip_device_metadata(ar);
add_device_metadata(ar);
} catch(std::runtime_error& _e)
{
ROCPROFSYS_VERBOSE(2, "%s\n", _e.what());
+2 -2
ファイルの表示
@@ -30,12 +30,12 @@ int
device_count();
int
hip_device_count();
rocm_device_count();
int
rsmi_device_count();
void
add_hip_device_metadata();
add_device_metadata();
} // namespace gpu
} // namespace rocprofsys
+11 -16
ファイルの表示
@@ -24,7 +24,7 @@
#include "core/defines.hpp"
#if defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
# if defined(HIP_INCLUDE_HIP_HIP_RUNTIME_H) || \
defined(HIP_INCLUDE_HIP_HIP_RUNTIME_API_H)
@@ -35,22 +35,17 @@
# define HIP_PROF_HIP_API_STRING 1
// following must be included before <roctracer_hip.h> for ROCm 6.0+
# if ROCPROFSYS_HIP_VERSION >= 60000
# if defined(USE_PROF_API)
# undef USE_PROF_API
# endif
# include <hip/hip_runtime.h>
# include <hip/hip_runtime_api.h>
// must be included after hip_runtime_api.h
# include <hip/hip_deprecated.h>
// must be included after hip_runtime_api.h
# include <hip_ostream_ops.h>
// must be included after hip_runtime_api.h
# include <hip/amd_detail/hip_prof_str.h>
# else
# include <hip/hip_runtime.h>
# include <hip/hip_runtime_api.h>
# if defined(USE_PROF_API)
# undef USE_PROF_API
# endif
# include <hip/hip_runtime.h>
# include <hip/hip_runtime_api.h>
// must be included after hip_runtime_api.h
# include <hip/hip_deprecated.h>
// must be included after hip_runtime_api.h
# include <roctracer/hip_ostream_ops.h>
// must be included after hip_runtime_api.h
# include <hip/amd_detail/hip_prof_str.h>
# include <hip/hip_version.h>
#endif
+1
ファイルの表示
@@ -104,6 +104,7 @@ perfetto_counter_track<Tp>::emplace(size_t _idx, const std::string& _v,
for(const auto& itr : _name_data)
{
_missing.emplace_back(std::make_tuple(*itr, itr->c_str(), false));
// TODO: _missing.emplace_back(*itr, itr->c_str(), false);
}
}
auto _index = _track_data.size();
+2 -8
ファイルの表示
@@ -23,13 +23,7 @@
#pragma once
#include "core/defines.hpp"
#include "core/hip_runtime.hpp"
#if defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0 && \
defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
# if ROCPROFSYS_HIP_VERSION == 0 || ROCPROFSYS_HIP_VERSION >= 50200
# include <rccl/rccl.h>
# else
# include <rccl.h>
# endif
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
# include <rccl/rccl.h>
#endif
+576
ファイルの表示
@@ -0,0 +1,576 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "core/rocprofiler-sdk.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "timemory.hpp"
#include <regex>
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
# include <timemory/defines.h>
# include <timemory/utility/demangle.hpp>
# include <rocprofiler-sdk/agent.h>
# include <rocprofiler-sdk/cxx/name_info.hpp>
# include <rocprofiler-sdk/fwd.h>
# include <algorithm>
# include <cstdint>
# include <set>
# include <sstream>
# include <string>
# include <unordered_set>
# include <vector>
// ROCPROFILER_CALL(result)
//
// Evaluates `result` (an expression yielding rocprofiler_status_t) exactly once.
// When the status is not ROCPROFILER_STATUS_SUCCESS, builds a message holding
// the stringified expression, __FILE__/__LINE__, the numeric status and the
// text from rocprofiler_get_status_string(), and passes it to
// ROCPROFSYS_WARNING. The status itself is not propagated to the caller.
//
// NOTE(review): the expansion is a bare `{ ... }` block rather than
// `do { ... } while(0)`, so `if(c) ROCPROFILER_CALL(x); else ...` will not
// compile; wrap such uses in braces.
# define ROCPROFILER_CALL(result) \
{ \
rocprofiler_status_t CHECKSTATUS = (result); \
if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \
{ \
auto msg = std::stringstream{}; \
std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \
msg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " \
<< "rocprofiler-sdk call [" << #result \
<< "] failed with error code " << CHECKSTATUS \
<< " :: " << status_msg; \
ROCPROFSYS_WARNING(0, "%s\n", msg.str().c_str()); \
} \
}
namespace rocprofsys
{
namespace rocprofiler_sdk
{
namespace
{
/// Derives the short setting name from an environment-variable style name.
///
/// The input is lower-cased and, when the lower-cased text starts with
/// "rocprofsys_", that prefix is removed. A "rocprofsys_" occurring anywhere
/// other than position 0 is left untouched.
///
/// \param _v  name to convert, e.g. "ROCPROFSYS_ROCM_DOMAINS"
/// \return    converted name, e.g. "rocm_domains"
std::string
get_setting_name(std::string _v)
{
    constexpr auto _prefix = std::string_view{ "rocprofsys_" };

    // tolower() has undefined behavior for negative values other than EOF, so
    // every (possibly signed) char is routed through unsigned char first
    for(auto& itr : _v)
        itr = static_cast<char>(::tolower(static_cast<unsigned char>(itr)));

    // rfind(..., 0) can only match at index 0: a prefix test that does not scan
    // the remainder of the string
    if(_v.rfind(_prefix, 0) == 0) return _v.substr(_prefix.length());
    return _v;
}
// ROCPROFSYS_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...)
//
// Expands to an immediately-invoked lambda (capturing by reference) that
// registers a setting on a variable named `_config`, which must be in scope at
// the expansion site:
//   - ENV_NAME is the lookup key; get_setting_name(ENV_NAME) is passed as the
//     second name argument of insert<TYPE, TYPE>()
//   - the initial value is TYPE{ INITIAL_VALUE }
//   - the string set passed last always holds "custom", "rocprofsys" and
//     "librocprof-sys", followed by the variadic arguments
// When insert() reports (via `.second == false`) that nothing was inserted, a
// "Duplicate setting" warning is printed. In every case the expression yields
// `_config->find(ENV_NAME)->second`, so callers can chain further calls on it.
# define ROCPROFSYS_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...) \
[&]() { \
auto _ret = _config->insert<TYPE, TYPE>( \
ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, \
TYPE{ INITIAL_VALUE }, \
std::set<std::string>{ "custom", "rocprofsys", "librocprof-sys", \
__VA_ARGS__ }); \
if(!_ret.second) \
{ \
ROCPROFSYS_PRINT("Warning! Duplicate setting: %s / %s\n", \
get_setting_name(ENV_NAME).c_str(), ENV_NAME); \
} \
return _config->find(ENV_NAME)->second; \
}()
/// Returns a lower-cased std::string copy of \p _val.
///
/// \tparam Tp   any type a std::string can be brace-constructed from
///              (std::string, std::string_view, const char*, ...)
/// \param _val  text to convert; it is not modified
template <typename Tp>
std::string
to_lower(const Tp& _val)
{
    auto _v = std::string{ _val };
    // the lambda takes unsigned char because tolower() has undefined behavior
    // when handed a negative (signed char) value
    std::transform(_v.begin(), _v.end(), _v.begin(), [](unsigned char _c) {
        return static_cast<char>(::tolower(_c));
    });
    return _v;
}
// Bundle of three strings describing the per-domain operation options
// (include filter, exclude filter, backtrace annotation). Remains an aggregate:
// it can be brace-initialized with the three strings in declaration order and
// every member defaults to an empty string.
struct operation_options
{
    std::string operations_include{};
    std::string operations_exclude{};
    std::string operations_annotate_backtrace{};
};
// Maps from a tracing kind to its operation_options entry: one map keyed by
// callback-tracing kind, one keyed by buffer-tracing kind. Both start empty.
// NOTE(review): presumably filled while the settings are registered in
// config_settings() -- confirm against the call sites.
auto callback_operation_option_names =
std::unordered_map<rocprofiler_callback_tracing_kind_t, operation_options>{};
auto buffered_operation_option_names =
std::unordered_map<rocprofiler_buffer_tracing_kind_t, operation_options>{};
/// Collects operation ids of the callback-tracing domain \p kindv.
///
/// - \p optname empty: every operation of the domain whose name is non-null
///   and not "none".
/// - otherwise: \p optname names a string setting whose value is split on any
///   of " ,;:\n\t"; each piece is compiled as a case-insensitive std::regex and
///   an operation is selected when regex_search finds the pattern in its name.
///   Each match is also reported through ROCPROFSYS_PRINT_F.
///
/// ROCPROFSYS_CONDITIONAL_ABORT_F is triggered when the setting cannot be
/// found; an empty setting value produces an empty set. A malformed pattern
/// makes the std::regex constructor throw std::regex_error.
std::unordered_set<int32_t>
get_operations_impl(rocprofiler_callback_tracing_kind_t kindv,
                    const std::string&                  optname = {})
{
    // name tables are queried from the SDK once and reused by every call
    static const auto callback_tracing_info =
        rocprofiler::sdk::get_callback_tracing_names();

    auto _ret = std::unordered_set<int32_t>{};

    if(optname.empty())
    {
        for(const auto& iitr : callback_tracing_info[kindv].items())
        {
            if(iitr.second && *iitr.second != "none") _ret.emplace(iitr.first);
        }
        return _ret;
    }

    auto _val = get_setting_value<std::string>(optname);

    ROCPROFSYS_CONDITIONAL_ABORT_F(!_val, "no setting %s\n", optname.c_str());

    if(_val->empty()) return _ret;

    for(const auto& itr : tim::delimit(*_val, " ,;:\n\t"))
    {
        // compile the pattern once per token instead of once per operation:
        // std::regex construction is by far the most expensive step here
        const auto _re = std::regex{ itr, std::regex_constants::icase };
        for(const auto& iitr : callback_tracing_info[kindv].items())
        {
            if(iitr.second && std::regex_search(iitr.second->data(), _re))
            {
                ROCPROFSYS_PRINT_F("%s ('%s') matched: %s\n", optname.c_str(),
                                   itr.c_str(), iitr.second->data());
                _ret.emplace(iitr.first);
            }
        }
    }

    return _ret;
}
/// Collects operation ids of the buffer-tracing domain \p kindv.
///
/// - \p optname empty: every operation of the domain whose name is non-null
///   and not "none".
/// - otherwise: \p optname names a string setting whose value is split on any
///   of " ,;:\n\t"; each piece is compiled as a case-insensitive std::regex and
///   an operation is selected when regex_search finds the pattern in its name.
///   Each match is also reported through ROCPROFSYS_PRINT_F.
///
/// ROCPROFSYS_CONDITIONAL_ABORT_F is triggered when the setting cannot be
/// found; an empty setting value produces an empty set. A malformed pattern
/// makes the std::regex constructor throw std::regex_error.
std::unordered_set<int32_t>
get_operations_impl(rocprofiler_buffer_tracing_kind_t kindv,
                    const std::string&                optname = {})
{
    // name tables are queried from the SDK once and reused by every call
    static const auto buffered_tracing_info =
        rocprofiler::sdk::get_buffer_tracing_names();

    auto _ret = std::unordered_set<int32_t>{};

    if(optname.empty())
    {
        for(const auto& iitr : buffered_tracing_info[kindv].items())
        {
            if(iitr.second && *iitr.second != "none") _ret.emplace(iitr.first);
        }
        return _ret;
    }

    auto _val = get_setting_value<std::string>(optname);

    ROCPROFSYS_CONDITIONAL_ABORT_F(!_val, "no setting %s\n", optname.c_str());

    if(_val->empty()) return _ret;

    for(const auto& itr : tim::delimit(*_val, " ,;:\n\t"))
    {
        // compile the pattern once per token instead of once per operation:
        // std::regex construction is by far the most expensive step here
        const auto _re = std::regex{ itr, std::regex_constants::icase };
        for(const auto& iitr : buffered_tracing_info[kindv].items())
        {
            if(iitr.second && std::regex_search(iitr.second->data(), _re))
            {
                ROCPROFSYS_PRINT_F("%s ('%s') matched: %s\n", optname.c_str(),
                                   itr.c_str(), iitr.second->data());
                _ret.emplace(iitr.first);
            }
        }
    }

    return _ret;
}
/// Combines the complete/include/exclude id sets into one ascending list.
///
/// The starting set is \p _include when it is non-empty and \p _complete
/// otherwise; every id that also appears in \p _exclude is dropped. The
/// include set is NOT intersected with \p _complete.
std::vector<int32_t>
get_operations_impl(const std::unordered_set<int32_t>& _complete,
                    const std::unordered_set<int32_t>& _include,
                    const std::unordered_set<int32_t>& _exclude)
{
    const auto& _base = _include.empty() ? _complete : _include;

    auto _selected = std::vector<int32_t>{};
    _selected.reserve(_base.size());
    for(auto _op : _base)
    {
        // keep only the ids that were not explicitly excluded
        if(_exclude.count(_op) == 0) _selected.push_back(_op);
    }

    std::sort(_selected.begin(), _selected.end());
    return _selected;
}
} // namespace
/// Register the rocprofiler-sdk related settings.
///
/// Registers, through ROCPROFSYS_CONFIG_SETTING:
///  - ROCPROFSYS_ROCM_DOMAINS: the domains to trace, with the list of valid
///    choices built from the buffer/callback tracing names plus the aliases
///    "hip_api", "hsa_api" and "marker_api";
///  - ROCPROFSYS_ROCM_EVENTS: the hardware counters to collect;
///  - per-domain ROCPROFSYS_ROCM_<DOMAIN>_OPERATIONS{,_EXCLUDE,_ANNOTATE_BACKTRACE}
///    filters, whose names are also recorded in callback_operation_option_names /
///    buffered_operation_option_names keyed by the domain value.
///
/// \param _config settings instance captured by the operation-settings lambda
///                (presumably consumed by ROCPROFSYS_CONFIG_SETTING -- confirm
///                against the macro definition).
void
config_settings(const std::shared_ptr<settings>& _config)
{
    const auto buffered_tracing_info = rocprofiler::sdk::get_buffer_tracing_names();
    const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();

    // domains which are never offered as a choice nor get operation settings
    auto _skip_domains =
        std::unordered_set<std::string_view>{ "none",
                                              "correlation_id_retirement",
                                              "marker_core_api",
                                              "marker_control_api",
                                              "marker_name_api",
                                              "code_object" };

    auto _domain_choices = std::vector<std::string>{};

    // append the lower-cased domain name unless it is skipped or already present
    auto _add_domain = [&_domain_choices, &_skip_domains](std::string_view _domain) {
        auto _v = to_lower(_domain);
        if(_skip_domains.count(_v) == 0)
        {
            auto itr = std::find(_domain_choices.begin(), _domain_choices.end(), _v);
            if(itr == _domain_choices.end()) _domain_choices.emplace_back(_v);
        }
    };

    // names of the operation settings registered so far; static so that each
    // setting is only registered once across calls and across domains
    static auto _option_names = std::unordered_set<std::string>{};

    // register the include/exclude/backtrace operation filters for one domain
    auto _add_operation_settings = [&_config, &_skip_domains](
                                       std::string_view _domain_name, const auto& _domain,
                                       auto& _operation_option_names) {
        auto _v = to_lower(_domain_name);
        if(_skip_domains.count(_v) > 0) return;

        auto _op_option_name = JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS");
        auto _eop_option_name =
            JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS_EXCLUDE");
        auto _bt_option_name =
            JOIN('_', "ROCPROFSYS_ROCM", _domain_name, "OPERATIONS_ANNOTATE_BACKTRACE");

        auto _op_choices = std::vector<std::string>{};
        for(auto itr : _domain.operations)
            _op_choices.emplace_back(std::string{ itr });

        // a domain without operations has nothing to filter
        if(_op_choices.empty()) return;

        _operation_option_names.emplace(
            _domain.value,
            operation_options{ _op_option_name, _eop_option_name, _bt_option_name });

        if(_option_names.emplace(_op_option_name).second)
        {
            ROCPROFSYS_CONFIG_SETTING(
                std::string, _op_option_name.c_str(),
                "Inclusive filter for domain operations (for API domains, this selects "
                "the functions to trace) [regex supported]",
                std::string{}, "rocm", "rocprofiler-sdk", "advanced")
                ->set_choices(_op_choices);
        }

        if(_option_names.emplace(_eop_option_name).second)
        {
            ROCPROFSYS_CONFIG_SETTING(
                std::string, _eop_option_name.c_str(),
                "Exclusive filter for domain operations applied after the inclusive "
                "filter (for API domains, removes function from trace) [regex supported]",
                std::string{}, "rocm", "rocprofiler-sdk", "advanced")
                ->set_choices(_op_choices);
        }

        if(_option_names.emplace(_bt_option_name).second)
        {
            ROCPROFSYS_CONFIG_SETTING(
                std::string, _bt_option_name.c_str(),
                "Specification of domain operations which will record a backtrace (for "
                "API domains, this is a list of function names) [regex supported]",
                std::string{}, "rocm", "rocprofiler-sdk", "advanced")
                ->set_choices(_op_choices);
        }
    };

    // room for every buffer + callback domain plus the three aliases below
    // (consecutive reserve() calls do not accumulate, so reserve the sum once)
    _domain_choices.reserve(buffered_tracing_info.size() + callback_tracing_info.size() +
                            3);

    _add_domain("hip_api");
    _add_domain("hsa_api");
    _add_domain("marker_api");

    for(const auto& itr : buffered_tracing_info)
        _add_domain(itr.name);

    for(const auto& itr : callback_tracing_info)
        _add_domain(itr.name);

    std::sort(_domain_choices.begin(), _domain_choices.end());

    namespace join = ::timemory::join;

    auto _domain_description =
        JOIN("", "Specification of ROCm domains to trace/profile. Choices: ",
             join::join(join::array_config{ ", ", "", "" }, _domain_choices));

    ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_DOMAINS", _domain_description,
                              std::string{ "hip_runtime_api,marker_api,kernel_dispatch,"
                                           "memory_copy,scratch_memory,page_migration" },
                              "rocm", "rocprofiler-sdk")
        ->set_choices(_domain_choices);

    ROCPROFSYS_CONFIG_SETTING(
        std::string, "ROCPROFSYS_ROCM_EVENTS",
        "ROCm hardware counters. Use ':device=N' syntax to specify collection on device "
        "number N, e.g. ':device=0'. If no device specification is provided, the event "
        "is collected on every available device",
        "", "rocm", "hardware_counters");

    // these remain valid ROCPROFSYS_ROCM_DOMAINS choices (they were added above)
    // but do not get per-operation filter settings
    _skip_domains.emplace("kernel_dispatch");
    _skip_domains.emplace("page_migration");
    _skip_domains.emplace("scratch_memory");

    // "marker_core_api" is skipped by name, so expose its operations under the
    // MARKER_API option names instead
    _add_operation_settings(
        "MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API],
        callback_operation_option_names);

    for(const auto& itr : callback_tracing_info)
        _add_operation_settings(itr.name, itr, callback_operation_option_names);

    for(const auto& itr : buffered_tracing_info)
        _add_operation_settings(itr.name, itr, buffered_operation_option_names);
}
std::unordered_set<rocprofiler_callback_tracing_kind_t>
get_callback_domains()
{
const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();
const auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>{
ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
};
auto _data = std::unordered_set<rocprofiler_callback_tracing_kind_t>{};
auto _domains =
tim::delimit(config::get_setting_value<std::string>("ROCPROFSYS_ROCM_DOMAINS")
.value_or(std::string{}),
" ,;:\t\n");
const auto valid_choices =
settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices();
auto invalid_domain = [&valid_choices](const auto& domainv) {
return !std::any_of(valid_choices.begin(), valid_choices.end(),
[&domainv](const auto& aitr) { return (aitr == domainv); });
};
for(const auto& itr : _domains)
{
if(invalid_domain(itr))
{
ROCPROFSYS_THROW("unsupported ROCPROFSYS_ROCM_DOMAINS value: %s\n",
itr.c_str());
}
if(itr == "hsa_api")
{
for(auto eitr : { ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API })
_data.emplace(eitr);
}
else if(itr == "hip_api")
{
for(auto eitr : { ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API })
_data.emplace(eitr);
}
else if(itr == "marker_api" || itr == "roctx")
{
_data.emplace(ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API);
}
else
{
for(size_t idx = 0; idx < callback_tracing_info.size(); ++idx)
{
auto ditr = callback_tracing_info[idx];
auto dval = static_cast<rocprofiler_callback_tracing_kind_t>(idx);
if(itr == to_lower(ditr.name) && supported.count(dval) > 0)
{
_data.emplace(dval);
break;
}
}
}
}
return _data;
}
std::unordered_set<rocprofiler_buffer_tracing_kind_t>
get_buffered_domains()
{
const auto buffer_tracing_info = rocprofiler::sdk::get_buffer_tracing_names();
const auto supported = std::unordered_set<rocprofiler_buffer_tracing_kind_t>{
ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION,
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
};
auto _data = std::unordered_set<rocprofiler_buffer_tracing_kind_t>{};
auto _domains =
tim::delimit(config::get_setting_value<std::string>("ROCPROFSYS_ROCM_DOMAINS")
.value_or(std::string{}),
" ,;:\t\n");
const auto valid_choices =
settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices();
auto invalid_domain = [&valid_choices](const auto& domainv) {
return !std::any_of(valid_choices.begin(), valid_choices.end(),
[&domainv](const auto& aitr) { return (aitr == domainv); });
};
for(const auto& itr : _domains)
{
if(invalid_domain(itr))
{
ROCPROFSYS_THROW("unsupported ROCPROFSYS_ROCM_DOMAINS value: %s\n",
itr.c_str());
}
if(itr == "hsa_api")
{
for(auto eitr : { ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API })
_data.emplace(eitr);
}
else if(itr == "hip_api")
{
for(auto eitr : { ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API,
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API })
_data.emplace(eitr);
}
else if(itr == "marker_api" || itr == "roctx")
{
_data.emplace(ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API);
}
else
{
for(size_t idx = 0; idx < buffer_tracing_info.size(); ++idx)
{
auto ditr = buffer_tracing_info[idx];
auto dval = static_cast<rocprofiler_buffer_tracing_kind_t>(idx);
if(itr == to_lower(ditr.name) && supported.count(dval) > 0)
{
_data.emplace(dval);
break;
}
}
}
}
return _data;
}
/// Split the ROCPROFSYS_ROCM_EVENTS setting into individual event entries.
///
/// An unset setting is treated as an empty string. Note that ':' is not a
/// delimiter here, unlike the domain/operation settings in this file.
std::vector<std::string>
get_rocm_events()
{
    const auto _events =
        get_setting_value<std::string>("ROCPROFSYS_ROCM_EVENTS").value_or(std::string{});
    return tim::delimit(_events, " ,;\t\n");
}
/// Sorted list of operation ids to trace for the callback domain \p kindv.
///
/// Starts from the include filter setting when it selects anything, otherwise
/// from all operations of the domain, then removes the operations selected by
/// the exclude filter setting.
///
/// Aborts when no operation option names were registered for \p kindv.
std::vector<int32_t>
get_operations(rocprofiler_callback_tracing_kind_t kindv)
{
    // message now names the container that is actually checked
    ROCPROFSYS_CONDITIONAL_ABORT_F(
        callback_operation_option_names.count(kindv) == 0,
        "callback_operation_option_names does not have value for %i\n", kindv);

    auto _complete = get_operations_impl(kindv);
    auto _include  = get_operations_impl(
        kindv, callback_operation_option_names.at(kindv).operations_include);
    auto _exclude = get_operations_impl(
        kindv, callback_operation_option_names.at(kindv).operations_exclude);

    return get_operations_impl(_complete, _include, _exclude);
}
/// Sorted list of operation ids to trace for the buffer domain \p kindv.
///
/// Starts from the include filter setting when it selects anything, otherwise
/// from all operations of the domain, then removes the operations selected by
/// the exclude filter setting.
///
/// Aborts when no operation option names were registered for \p kindv.
std::vector<int32_t>
get_operations(rocprofiler_buffer_tracing_kind_t kindv)
{
    const bool _missing = (buffered_operation_option_names.count(kindv) == 0);
    ROCPROFSYS_CONDITIONAL_ABORT_F(
        _missing, "buffered_operation_option_names does not have value for %i\n", kindv);

    // look the option names up once; evaluation order matches the original
    const auto& _opts = buffered_operation_option_names.at(kindv);

    auto _all      = get_operations_impl(kindv);
    auto _selected = get_operations_impl(kindv, _opts.operations_include);
    auto _removed  = get_operations_impl(kindv, _opts.operations_exclude);

    return get_operations_impl(_all, _selected, _removed);
}
/// Operation ids of the callback domain \p kindv which should record a backtrace,
/// as selected by the domain's OPERATIONS_ANNOTATE_BACKTRACE setting.
///
/// Aborts when no operation option names were registered for \p kindv.
std::unordered_set<int32_t>
get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv)
{
    // message now names the container that is actually checked
    ROCPROFSYS_CONDITIONAL_ABORT_F(
        callback_operation_option_names.count(kindv) == 0,
        "callback_operation_option_names does not have value for %i\n", kindv);

    auto _data = get_operations_impl(
        kindv, callback_operation_option_names.at(kindv).operations_annotate_backtrace);

    // range construction replaces the manual reserve + emplace loop
    return std::unordered_set<int32_t>{ _data.begin(), _data.end() };
}
/// Operation ids of the buffer domain \p kindv which should record a backtrace,
/// as selected by the domain's OPERATIONS_ANNOTATE_BACKTRACE setting.
///
/// Aborts when no operation option names were registered for \p kindv.
std::unordered_set<int32_t>
get_backtrace_operations(rocprofiler_buffer_tracing_kind_t kindv)
{
    ROCPROFSYS_CONDITIONAL_ABORT_F(
        buffered_operation_option_names.count(kindv) == 0,
        "buffered_operation_option_names does not have value for %i\n", kindv);

    // the buffer-kind overload already returns std::unordered_set<int32_t>, so
    // the element-by-element copy into a second set was redundant
    return get_operations_impl(
        kindv, buffered_operation_option_names.at(kindv).operations_annotate_backtrace);
}
} // namespace rocprofiler_sdk
} // namespace rocprofsys
#else
// Fallback used when the ROCm-enabled implementation above is compiled out.
namespace rocprofsys::rocprofiler_sdk
{
/// No-op: there are no rocprofiler-sdk settings to register in this build.
void
config_settings(const std::shared_ptr<settings>& /*unused*/)
{}
}  // namespace rocprofsys::rocprofiler_sdk
#endif
+70
ファイルの表示
@@ -0,0 +1,70 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/timemory.hpp"
#if defined(ROCPROFSYS_USE_ROCM)
# include <rocprofiler-sdk/fwd.h>
# include <rocprofiler-sdk/rocprofiler.h>
#endif
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace rocprofsys
{
namespace rocprofiler_sdk
{
/// Register the rocprofiler-sdk related settings. Declared unconditionally;
/// the remaining functions are only available when ROCPROFSYS_USE_ROCM is defined.
void
config_settings(const std::shared_ptr<settings>&);
#if defined(ROCPROFSYS_USE_ROCM)
/// Callback-tracing kinds selected for tracing.
std::unordered_set<rocprofiler_callback_tracing_kind_t>
get_callback_domains();
/// Buffer-tracing kinds selected for tracing.
std::unordered_set<rocprofiler_buffer_tracing_kind_t>
get_buffered_domains();
/// Operation ids to trace for the given callback-tracing kind.
std::vector<int32_t>
get_operations(rocprofiler_callback_tracing_kind_t kindv);
/// Operation ids to trace for the given buffer-tracing kind.
std::vector<int32_t>
get_operations(rocprofiler_buffer_tracing_kind_t kindv);
/// Requested ROCm hardware counter (event) entries.
std::vector<std::string>
get_rocm_events();
/// Operation ids of the given callback-tracing kind which record a backtrace.
std::unordered_set<int32_t>
get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv);
/// Operation ids of the given buffer-tracing kind which record a backtrace.
std::unordered_set<int32_t>
get_backtrace_operations(rocprofiler_buffer_tracing_kind_t kindv);
#endif
} // namespace rocprofiler_sdk
} // namespace rocprofsys
+4 -2
ファイルの表示
@@ -21,6 +21,7 @@
// SOFTWARE.
#include "state.hpp"
#include "common/static_object.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "utility.hpp"
@@ -35,8 +36,9 @@ namespace
auto&
get_state_value()
{
static auto _v = std::atomic<State>{ State::PreInit };
return _v;
static auto*& _v = common::static_object<std::atomic<State>>::construct(
common::do_not_destroy{}, State::PreInit);
return *_v;
}
ThreadState&
+9
ファイルの表示
@@ -74,6 +74,15 @@ get_reserved_vector(size_t _n)
return _v;
}
/// Reserve capacity for at least \p _n elements in the given (rvalue) vector
/// and hand the same vector back as an rvalue reference.
template <typename... Tp>
inline decltype(auto)
get_reserved_vector(std::vector<Tp...>&& _v, size_t _n)
{
    _v.reserve(_n);
    // `_v` is a plain rvalue-reference parameter (not a forwarding reference),
    // so moving is equivalent to forwarding it as std::vector<Tp...>
    return std::move(_v);
}
template <typename Tp, size_t Offset>
struct offset_index_sequence;
+2 -1
ファイルの表示
@@ -25,7 +25,8 @@ target_include_directories(
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocprof-sys-user>
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/../rocprof-sys>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
PRIVATE ${rocprofiler-sdk_INCLUDE_DIR})
target_link_libraries(
rocprofiler-systems-dl-library
PUBLIC $<BUILD_INTERFACE:${dl_LIBRARY}>
+134 -55
ファイルの表示
@@ -54,6 +54,14 @@
#include <thread>
#include <unistd.h>
#if !defined(ROCPROFSYS_USE_ROCM)
# define ROCPROFSYS_USE_ROCM 0
#endif
#if ROCPROFSYS_USE_ROCM > 0
# include <rocprofiler-sdk/registration.h>
#endif
//--------------------------------------------------------------------------------------//
#define ROCPROFSYS_DLSYM(VARNAME, HANDLE, FUNCNAME) \
@@ -79,6 +87,7 @@
//--------------------------------------------------------------------------------------//
using main_func_t = int (*)(int, char**, char**);
using init_func_t = void (*)(void);
std::ostream&
operator<<(std::ostream& _os, const SpaceHandle& _handle)
@@ -360,14 +369,8 @@ struct ROCPROFSYS_INTERNAL_API indirect
ROCPROFSYS_DLSYM(kokkosp_dual_view_modify_f, m_omnihandle,
"kokkosp_dual_view_modify");
#if ROCPROFSYS_USE_ROCTRACER > 0
ROCPROFSYS_DLSYM(hsa_on_load_f, m_omnihandle, "OnLoad");
ROCPROFSYS_DLSYM(hsa_on_unload_f, m_omnihandle, "OnUnload");
#endif
#if ROCPROFSYS_USE_ROCPROFILER > 0
ROCPROFSYS_DLSYM(rocp_on_load_tool_prop_f, m_omnihandle, "OnLoadToolProp");
ROCPROFSYS_DLSYM(rocp_on_unload_tool_f, m_omnihandle, "OnUnloadTool");
#if ROCPROFSYS_USE_ROCM > 0
ROCPROFSYS_DLSYM(rocprofiler_configure_f, m_omnihandle, "rocprofiler_configure");
#endif
#if ROCPROFSYS_USE_OMPT == 0
@@ -460,16 +463,9 @@ public:
void (*kokkosp_dual_view_sync_f)(const char*, const void* const, bool) = nullptr;
void (*kokkosp_dual_view_modify_f)(const char*, const void* const, bool) = nullptr;
// HSA functions
#if ROCPROFSYS_USE_ROCTRACER > 0
bool (*hsa_on_load_f)(HsaApiTable*, uint64_t, uint64_t, const char* const*) = nullptr;
void (*hsa_on_unload_f)() = nullptr;
#endif
// ROCP functions
#if ROCPROFSYS_USE_ROCPROFILER > 0
void (*rocp_on_load_tool_prop_f)(void* settings) = nullptr;
void (*rocp_on_unload_tool_f)() = nullptr;
#if ROCPROFSYS_USE_ROCM > 0
rocprofiler_tool_configure_result_t* (*rocprofiler_configure_f)(
uint32_t, const char*, uint32_t, rocprofiler_client_id_t*) = nullptr;
#endif
// OpenMP functions
@@ -644,13 +640,18 @@ extern "C"
bool _invoked = false;
ROCPROFSYS_DL_INVOKE_STATUS(_invoked, get_indirect().rocprofsys_init_f, a, b, c);
if(_invoked)
{
dl::get_active() = true;
dl::get_inited() = true;
dl::_rocprofsys_dl_verbose = dl::get_rocprofsys_dl_env();
if(dl::get_instrumented() < dl::InstrumentMode::PythonProfile)
if(dl::get_instrumented() >= dl::InstrumentMode::None &&
dl::get_instrumented() < dl::InstrumentMode::PythonProfile)
{
dl::rocprofsys_postinit((c) ? std::string{ c } : std::string{});
}
}
}
@@ -1069,43 +1070,17 @@ extern "C"
//----------------------------------------------------------------------------------//
//
// HSA
// ROCm
//
//----------------------------------------------------------------------------------//
#if ROCPROFSYS_USE_ROCTRACER > 0
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names)
#if ROCPROFSYS_USE_ROCM > 0
rocprofiler_tool_configure_result_t* rocprofiler_configure(
uint32_t version, const char* runtime_version, uint32_t priority,
rocprofiler_client_id_t* client_id)
{
return ROCPROFSYS_DL_INVOKE(get_indirect().hsa_on_load_f, table, runtime_version,
failed_tool_count, failed_tool_names);
}
void OnUnload() { return ROCPROFSYS_DL_INVOKE(get_indirect().hsa_on_unload_f); }
#endif
//----------------------------------------------------------------------------------//
//
// ROCP
//
//----------------------------------------------------------------------------------//
#if ROCPROFSYS_USE_ROCPROFILER > 0
void OnLoadToolProp(void* settings)
{
ROCPROFSYS_DL_LOG(
-16,
"invoking %s(rocprofiler_settings_t*) within librocprof-sys-dl.so "
"will cause a silent failure for rocprofiler. ROCP_TOOL_LIB "
"should be set to librocprof-sys.so\n",
__FUNCTION__);
abort();
return ROCPROFSYS_DL_INVOKE(get_indirect().rocp_on_load_tool_prop_f, settings);
}
void OnUnloadTool()
{
return ROCPROFSYS_DL_INVOKE(get_indirect().rocp_on_unload_tool_f);
return ROCPROFSYS_DL_INVOKE(get_indirect().rocprofiler_configure_f, version,
runtime_version, priority, client_id);
}
#endif
@@ -1227,7 +1202,9 @@ rocprofsys_preinit()
void
rocprofsys_postinit(std::string _exe)
{
switch(get_instrumented())
InstrumentMode instrumentMode = get_instrumented();
switch(instrumentMode)
{
case InstrumentMode::None:
case InstrumentMode::BinaryRewrite:
@@ -1393,20 +1370,122 @@ verify_instrumented_preloaded()
bool _handle_preload = rocprofsys_preload();
main_func_t main_real = nullptr;
init_func_t init_real = nullptr;
} // namespace
} // namespace dl
} // namespace rocprofsys
extern "C"
{
void rocprofsys_main_init(void) ROCPROFSYS_INTERNAL_API;
int rocprofsys_main(int argc, char** argv, char** envp) ROCPROFSYS_INTERNAL_API;
void rocprofsys_set_main_init(init_func_t) ROCPROFSYS_INTERNAL_API;
void rocprofsys_set_main(main_func_t) ROCPROFSYS_INTERNAL_API;
// Remember the address of the real init function handed in by the caller.
void rocprofsys_set_main_init(init_func_t _init_real)
{
    using namespace ::rocprofsys;
    dl::init_real = _init_real;
}
// Remember the address of the real main function handed in by the caller.
void rocprofsys_set_main(main_func_t _main_real)
{
    using namespace ::rocprofsys;
    dl::main_real = _main_real;
}
// void rocprofsys_main_init(int argc, char** argv, char** envp)
// {
// ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__);
// using ::rocprofsys::common::get_env;
// using ::rocprofsys::dl::get_default_mode;
// // prevent re-entry
// static int _reentry = 0;
// if(_reentry > 0) return -1;
// _reentry = 1;
// int ret = 0;
// if(::rocprofsys::dl::init_real)
// {
// if(envp)
// {
// size_t _idx = 0;
// while(envp[_idx] != nullptr)
// {
// auto _env_v = std::string_view{ envp[_idx++] };
// if(_env_v.find("ROCPROFSYS") != 0 &&
// _env_v.find("librocprof-sys") == std::string_view::npos)
// continue;
// auto _pos = _env_v.find('=');
// if(_pos < _env_v.length())
// {
// auto _var = std::string{ _env_v }.substr(0, _pos);
// auto _val = std::string{ _env_v }.substr(_pos + 1);
// ROCPROFSYS_DL_LOG(1, "%s(%s, %s)\n", "rocprofsys_set_env",
// _var.c_str(), _val.c_str());
// setenv(_var.c_str(), _val.c_str(), 0);
// }
// }
// }
// ret = (*::rocprofsys::dl::init_real)(argc, argv, envp);
// }
// else
// {
// ROCPROFSYS_DL_LOG(
// 0, "%s\n",
// "Unsuccessful wrapping of init: nullptr to real init function");
// }
// auto _mode = get_env("ROCPROFSYS_MODE", get_default_mode());
// rocprofsys_init(_mode.c_str(),
// dl::get_instrumented() == dl::InstrumentMode::BinaryRewrite,
// argv[0]);
// return ret;
// }
// int rocprofsys_main(int argc, char** argv, char** envp)
// {
// ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__);
// // prevent re-entry
// static int _reentry = 0;
// if(_reentry > 0) return -1;
// _reentry = 1;
// if(!::rocprofsys::dl::main_real)
// throw std::runtime_error("[rocprof-sys][dl] Unsuccessful wrapping of main:
// "
// "nullptr to real main function");
// rocprofsys_push_trace(basename(argv[0]));
// int ret = (*::rocprofsys::dl::main_real)(argc, argv, envp);
// rocprofsys_pop_trace(basename(argv[0]));
// rocprofsys_finalize();
// return ret;
// }
// Wrapper for the init function: logs its own invocation, then calls the
// stored real init function, or logs a message when none has been stored.
void rocprofsys_main_init(void)
{
    ROCPROFSYS_DL_LOG(0, "[%s].\n", __FUNCTION__);

    auto _real_init = ::rocprofsys::dl::init_real;
    if(!_real_init)
    {
        ROCPROFSYS_DL_LOG(
            0, "Unsuccessful wrapping of init: real_init function is nullptr.\n");
        return;
    }

    // Call real init function
    _real_init();
}
int rocprofsys_main(int argc, char** argv, char** envp)
{
ROCPROFSYS_DL_LOG(0, "%s\n", __FUNCTION__);
@@ -1420,7 +1499,7 @@ extern "C"
if(!::rocprofsys::dl::main_real)
throw std::runtime_error("[rocprof-sys][dl] Unsuccessful wrapping of main: "
"nullptr to real main function");
"real_main function is nullptr.");
if(envp)
{
@@ -1455,4 +1534,4 @@ extern "C"
return ret;
}
}
} // extern "C"
+6 -18
ファイルの表示
@@ -53,12 +53,8 @@
# define ROCPROFSYS_USE_OMPT 0
#endif
#if !defined(ROCPROFSYS_USE_ROCTRACER)
# define ROCPROFSYS_USE_ROCTRACER 0
#endif
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
# define ROCPROFSYS_USE_ROCPROFILER 0
#if !defined(ROCPROFSYS_USE_ROCM)
# define ROCPROFSYS_USE_ROCM 0
#endif
//--------------------------------------------------------------------------------------//
@@ -177,20 +173,12 @@ extern "C"
const char*) ROCPROFSYS_PUBLIC_API;
# endif
# if ROCPROFSYS_USE_ROCTRACER > 0
// HSA
struct HsaApiTable;
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names) ROCPROFSYS_PUBLIC_API;
void OnUnload() ROCPROFSYS_PUBLIC_API;
# if ROCPROFSYS_USE_ROCM > 0
struct rocprofiler_tool_configure_result_t;
struct rocprofiler_client_id_t;
# endif
# if ROCPROFSYS_USE_ROCPROFILER > 0
// ROCP
void OnLoadToolProp(void* settings) ROCPROFSYS_PUBLIC_API;
void OnUnloadTool() ROCPROFSYS_PUBLIC_API;
# endif
#endif
#endif // ROCPROFSYS_DL_SOURCE
}
namespace rocprofsys
+22 -11
ファイルの表示
@@ -37,20 +37,19 @@
// local type definitions
//
typedef int (*main_func_t)(int, char**, char**);
typedef int (*start_main_t)(int (*)(int, char**, char**), int, char**,
int (*)(int, char**, char**), void (*)(void), void (*)(void),
void*);
typedef void (*init_func_t)(void);
typedef int (*start_main_t)(int (*)(int, char**, char**), int, char**, void (*)(void),
void (*)(void), void (*)(void), void*);
//
// local function declarations
//
int
rocprofsys_libc_start_main(int (*)(int, char**, char**), int, char**,
int (*)(int, char**, char**), void (*)(void), void (*)(void),
void*) ROCPROFSYS_INTERNAL_API;
rocprofsys_libc_start_main(int (*)(int, char**, char**), int, char**, void (*)(void),
void (*)(void), void (*)(void), void*) ROCPROFSYS_INTERNAL_API;
int
__libc_start_main(int (*)(int, char**, char**), int, char**, int (*)(int, char**, char**),
__libc_start_main(int (*)(int, char**, char**), int, char**, void (*)(void),
void (*)(void), void (*)(void), void*) ROCPROFSYS_PUBLIC_API;
//
@@ -79,12 +78,18 @@ basename(const char*);
extern void rocprofsys_set_main(main_func_t) ROCPROFSYS_INTERNAL_API;
extern void
rocprofsys_set_main_init(init_func_t func) ROCPROFSYS_INTERNAL_API;
extern void
rocprofsys_main_init(void) ROCPROFSYS_INTERNAL_API;
extern int
rocprofsys_main(int argc, char** argv, char** envp) ROCPROFSYS_INTERNAL_API;
int
rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv,
int (*_init)(int, char**, char**), void (*_fini)(void),
void (*_init)(void), void (*_fini)(void),
void (*_rtld_fini)(void), void* _stack_end)
{
int _preload = rocprofsys_preload_library();
@@ -97,8 +102,9 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char**
// get the address of this function
void* _this_func = __builtin_return_address(0);
// Save the real main function address
// Save the real main function addresses
rocprofsys_set_main(_main);
rocprofsys_set_main_init(_init);
// Find the real __libc_start_main()
start_main_t user_main = dlsym(RTLD_NEXT, "__libc_start_main");
@@ -115,6 +121,10 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char**
}
else
{
// return user_main(rocprofsys_main, _argc, _argv,
// rocprofsys_main_init, _fini,
// _rtld_fini, _stack_end);
// call rocprof-sys main function wrapper
return user_main(rocprofsys_main, _argc, _argv, _init, _fini, _rtld_fini,
_stack_end);
@@ -129,9 +139,10 @@ rocprofsys_libc_start_main(int (*_main)(int, char**, char**), int _argc, char**
int
__libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv,
int (*_init)(int, char**, char**), void (*_fini)(void),
void (*_rtld_fini)(void), void* _stack_end)
void (*_init)(void), void (*_fini)(void), void (*_rtld_fini)(void),
void* _stack_end)
{
// intercept the main function
return rocprofsys_libc_start_main(_main, _argc, _argv, _init, _fini, _rtld_fini,
_stack_end);
}
@@ -43,19 +43,21 @@ extern "C"
ROCPROFSYS_CATEGORY_PYTHON,
ROCPROFSYS_CATEGORY_USER,
ROCPROFSYS_CATEGORY_HOST,
ROCPROFSYS_CATEGORY_DEVICE_HIP,
ROCPROFSYS_CATEGORY_DEVICE_HSA,
ROCPROFSYS_CATEGORY_ROCM_HIP,
ROCPROFSYS_CATEGORY_ROCM_HSA,
ROCPROFSYS_CATEGORY_ROCM_ROCTX,
ROCPROFSYS_CATEGORY_ROCM,
ROCPROFSYS_CATEGORY_ROCM_HIP_API,
ROCPROFSYS_CATEGORY_ROCM_HSA_API,
ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH,
ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY,
ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY,
ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION,
ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION,
ROCPROFSYS_CATEGORY_ROCM_MARKER_API,
ROCPROFSYS_CATEGORY_ROCM_SMI,
ROCPROFSYS_CATEGORY_ROCM_SMI_BUSY,
ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP,
ROCPROFSYS_CATEGORY_ROCM_SMI_POWER,
ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE,
ROCPROFSYS_CATEGORY_ROCM_RCCL,
ROCPROFSYS_CATEGORY_ROCTRACER,
ROCPROFSYS_CATEGORY_ROCPROFILER,
ROCPROFSYS_CATEGORY_SAMPLING,
ROCPROFSYS_CATEGORY_PTHREAD,
ROCPROFSYS_CATEGORY_KOKKOS,
+13 -32
ファイルの表示
@@ -26,6 +26,7 @@
//
#include "api.hpp"
#include "common/setup.hpp"
#include "common/static_object.hpp"
#include "core/categories.hpp"
#include "core/components/fwd.hpp"
#include "core/concepts.hpp"
@@ -46,13 +47,12 @@
#include "library/components/mpi_gotcha.hpp"
#include "library/components/numa_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/rocprofiler.hpp"
#include "library/coverage.hpp"
#include "library/ompt.hpp"
#include "library/process_sampler.hpp"
#include "library/ptl.hpp"
#include "library/rcclp.hpp"
#include "library/rocprofiler.hpp"
#include "library/rocprofiler-sdk.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
@@ -399,10 +399,6 @@ rocprofsys_init_library_hidden()
if(_debug_init) config::set_setting_value("ROCPROFSYS_DEBUG", _debug_value);
} };
tim::trait::runtime_enabled<comp::roctracer>::set(get_use_roctracer());
tim::trait::runtime_enabled<comp::roctracer_data>::set(get_use_roctracer() &&
get_use_timemory());
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n");
}
@@ -718,13 +714,6 @@ rocprofsys_finalize_hidden(void)
}
}
if(get_use_roctracer())
{
ROCPROFSYS_VERBOSE_F(1, "Flushing roctracer...\n");
// ensure that roctracer is flushed before setting the state to finalized
comp::roctracer::flush();
}
set_state(State::Finalized);
push_enable_sampling_on_child_threads(false);
@@ -785,6 +774,14 @@ rocprofsys_finalize_hidden(void)
ompt::shutdown();
}
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
// TODO: option for rocm
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down ROCm...\n");
rocprofiler_sdk::shutdown();
}
#endif
ROCPROFSYS_DEBUG_F("Stopping and destroying instrumentation bundles...\n");
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
{
@@ -835,24 +832,6 @@ rocprofsys_finalize_hidden(void)
process_sampler::shutdown();
}
if(get_use_roctracer())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down roctracer...\n");
// ensure that threads running roctracer callbacks shutdown
comp::roctracer::shutdown();
// join extra thread(s) used by roctracer
ROCPROFSYS_VERBOSE_F(2, "Waiting on roctracer tasks...\n");
tasking::join();
}
if(get_use_rocprofiler())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down rocprofiler...\n");
rocprofiler::post_process();
rocprofiler::rocm_cleanup();
}
if(get_use_causal())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down causal sampling...\n");
@@ -919,7 +898,7 @@ rocprofsys_finalize_hidden(void)
process_sampler::post_process();
}
// shutdown tasking before timemory is finalized, especially the roctracer thread-pool
// shutdown tasking before timemory is finalized
ROCPROFSYS_VERBOSE_F(1, "Shutting down thread-pools...\n");
tasking::shutdown();
@@ -991,6 +970,8 @@ rocprofsys_finalize_hidden(void)
tim::signals::enable_signal_detection(
{ tim::signals::sys_signal::SegFault, tim::signals::sys_signal::Stop },
[](int) {});
common::destroy_static_objects();
}
//======================================================================================//
+6 -21
ファイルの表示
@@ -23,8 +23,7 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
${CMAKE_CURRENT_LIST_DIR}/rocm.hpp
${CMAKE_CURRENT_LIST_DIR}/rocm_smi.hpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp
${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
${CMAKE_CURRENT_LIST_DIR}/runtime.hpp
${CMAKE_CURRENT_LIST_DIR}/sampling.hpp
${CMAKE_CURRENT_LIST_DIR}/thread_data.hpp
@@ -35,37 +34,23 @@ set(library_headers
target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources}
${library_headers})
if(ROCPROFSYS_USE_ROCTRACER OR ROCPROFSYS_USE_ROCPROFILER)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm.cpp)
endif()
if(ROCPROFSYS_USE_ROCTRACER)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp)
endif()
if(ROCPROFSYS_USE_RCCL)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
endif()
if(ROCPROFSYS_USE_ROCPROFILER)
if(ROCPROFSYS_USE_ROCM)
target_sources(
rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp)
endif()
if(ROCPROFSYS_USE_ROCM_SMI)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp)
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm.cpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp
${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp)
add_subdirectory(rocprofiler-sdk)
endif()
add_subdirectory(causal)
add_subdirectory(components)
add_subdirectory(coverage)
add_subdirectory(rocm)
add_subdirectory(tracing)
set(ndebug_sources
@@ -28,8 +28,6 @@ set(component_headers
${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/numa_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp
${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp
${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp)
@@ -37,16 +35,6 @@ set(component_headers
target_sources(rocprofiler-systems-object-library PRIVATE ${component_sources}
${component_headers})
if(ROCPROFSYS_USE_ROCPROFILER)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp)
endif()
if(ROCPROFSYS_USE_ROCTRACER)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp)
endif()
if(ROCPROFSYS_USE_RCCL)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
@@ -64,13 +64,13 @@ using tim::type_list;
// these categories increment push/pop counts, which are used for sanity checks since
// they should ALWAYS be popped if they were pushed
using tracing_count_categories_t =
type_list<category::host, category::mpi, category::pthread, category::rocm_hip,
category::rocm_hsa, category::rocm_rccl>;
type_list<category::host, category::mpi, category::pthread, category::rocm_hip_api,
category::rocm_hsa_api, category::rocm_rccl>;
// convert these categories to throughput points
using causal_throughput_categories_t =
type_list<category::host, category::kokkos, category::ompt, category::rocm_hip,
category::rocm_hsa, category::rocm_rccl, category::rocm_roctx>;
type_list<category::host, category::kokkos, category::ompt, category::rocm_hip_api,
category::rocm_hsa_api, category::rocm_rccl, category::rocm_marker_api>;
// define this outside of category region functions so that the
// static thread_local is global instead of per-template instantiation
@@ -28,7 +28,6 @@
#include "core/utility.hpp"
#include "library/causal/delay.hpp"
#include "library/components/category_region.hpp"
#include "library/components/roctracer.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
@@ -61,7 +60,7 @@ shutdown();
namespace component
{
using bundle_t = tim::lightweight_tuple<comp::wall_clock, comp::roctracer_data>;
using bundle_t = tim::lightweight_tuple<comp::wall_clock>;
using category_region_t = tim::lightweight_tuple<category_region<category::pthread>>;
namespace
@@ -82,7 +81,6 @@ inline void
start_bundle(bundle_t& _bundle, int64_t _tid, Args&&... _args)
{
if(!get_use_timemory() && !get_use_perfetto()) return;
trait::runtime_enabled<comp::roctracer_data>::set(get_use_roctracer());
ROCPROFSYS_BASIC_VERBOSE_F(3, "starting bundle '%s' in thread %li...\n",
_bundle.key().c_str(), _tid);
if constexpr(sizeof...(Args) > 0)
@@ -619,5 +617,3 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
}
} // namespace component
} // namespace rocprofsys
TIMEMORY_INITIALIZE_STORAGE(component::roctracer_data)
@@ -1,193 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/rocprofiler.hpp"
#include "core/common.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/defines.hpp"
#include "core/dynamic_library.hpp"
#include "core/perfetto.hpp"
#include "core/redirect.hpp"
#include "library/rocprofiler.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include <timemory/storage/types.hpp>
#include <timemory/utility/types.hpp>
#include <timemory/variadic/functional.hpp>
#include <timemory/variadic/lightweight_tuple.hpp>
#include <rocprofiler.h>
#include <cstdint>
#include <string_view>
#include <type_traits>
namespace rocprofsys
{
namespace component
{
namespace
{
auto&
rocprofiler_activity_count()
{
static std::atomic<int64_t> _v{ 0 };
return _v;
}
} // namespace
unique_ptr_t<rocm_data_t>&
rocm_data(int64_t _tid)
{
using thread_data_t = thread_data<rocm_data_t, rocm_event>;
return thread_data_t::instance(construct_on_thread{ _tid });
}
// Builds an event record from the identifiers/timestamps of a dispatch and the raw
// rocprofiler feature array (_features_v is cast to rocprofiler_feature_t*).
//
// For every feature whose data kind carries a value, the feature index is appended to
// feature_names and the value to feature_values, so both vectors always have the same
// length and feature_names[k] labels feature_values[k]. Features that are still
// ROCPROFILER_DATA_KIND_UNINIT are skipped entirely.
rocm_event::rocm_event(uint32_t _dev, uint32_t _thr, uint32_t _queue,
                       std::string _event_name, rocm_metric_type _begin,
                       rocm_metric_type _end, uint32_t _feature_count, void* _features_v)
: device_id{ _dev }
, thread_id{ _thr }
, queue_id{ _queue }
, entry{ _begin }
, exit{ _end }
, name(std::move(_event_name))
{
    auto* _features = static_cast<rocprofiler_feature_t*>(_features_v);

    // nothing to record (and nothing safe to dereference) without a feature array
    if(_features == nullptr || _feature_count == 0) return;

    feature_values.reserve(_feature_count);
    feature_names.reserve(_feature_count);

    // keep the index and the value in lock-step: an index is only stored when a
    // value is stored alongside it
    auto _record = [this](uint32_t _idx, rocm_feature_value&& _val) {
        feature_names.emplace_back(_idx);
        feature_values.emplace_back(std::move(_val));
    };

    for(uint32_t i = 0; i < _feature_count; ++i)
    {
        const rocprofiler_feature_t* p = &_features[i];
        switch(p->data.kind)
        {
            // no result was produced for this feature: record neither name nor value
            case ROCPROFILER_DATA_KIND_UNINIT: break;
            case ROCPROFILER_DATA_KIND_BYTES:
                // only the size of the byte payload is stored, not the payload itself
                _record(i, rocm_feature_value{ p->data.result_bytes.size });
                break;
            case ROCPROFILER_DATA_KIND_INT32:
                _record(i, rocm_feature_value{ p->data.result_int32 });
                break;
            case ROCPROFILER_DATA_KIND_FLOAT:
                _record(i, rocm_feature_value{ p->data.result_float });
                break;
            case ROCPROFILER_DATA_KIND_DOUBLE:
                _record(i, rocm_feature_value{ p->data.result_double });
                break;
            case ROCPROFILER_DATA_KIND_INT64:
                _record(i, rocm_feature_value{ p->data.result_int64 });
                break;
        }
    }
}
// Renders the event as a single comma-separated line: the event name, the device,
// queue and thread identifiers, the entry/exit values, and then one
// "<label> = <value>" pair per recorded feature. Labels are looked up per device via
// rocprofsys::rocprofiler::get_data_labels(); values are printed in fixed notation with
// three digits of precision and a minimum field width of four.
std::string
rocm_event::as_string() const
{
    std::stringstream _out{};

    _out << name;
    _out << ", device: " << device_id;
    _out << ", queue: " << queue_id;
    _out << ", thread: " << thread_id;
    _out << ", entry: " << entry;
    _out << ", exit = " << exit;

    _out.precision(3);
    _out << std::fixed;

    // visitor shared by every feature: writes whichever alternative the variant holds
    auto _write_value = [&_out](auto&& _val) { _out << std::setw(4) << _val; };

    const size_t _count = feature_names.size();
    for(size_t _n = 0; _n < _count; ++_n)
    {
        auto _label = rocprofsys::rocprofiler::get_data_labels().at(device_id).at(
            feature_names.at(_n));
        _out << ", " << _label << " = ";
        std::visit(_write_value, feature_values.at(_n));
    }

    return _out.str();
}
void
rocprofiler::preinit()
{
rocprofiler_data::label() = "rocprofiler";
rocprofiler_data::description() = "ROCm hardware counters";
}
void
rocprofiler::start()
{
if(tracker_type::start() == 0) setup();
}
void
rocprofiler::stop()
{
if(tracker_type::stop() == 0) shutdown();
}
bool
rocprofiler::is_setup()
{
return rocprofsys::rocprofiler::is_setup();
}
void
rocprofiler::add_setup(const std::string&, std::function<void()>&&)
{}
void
rocprofiler::add_shutdown(const std::string&, std::function<void()>&&)
{}
void
rocprofiler::remove_setup(const std::string&)
{}
void
rocprofiler::remove_shutdown(const std::string&)
{}
void
rocprofiler::setup()
{
ROCPROFSYS_VERBOSE_F(1, "rocprofiler is setup\n");
}
void
rocprofiler::shutdown()
{
rocprofsys::rocprofiler::post_process();
rocprofsys::rocprofiler::rocm_cleanup();
ROCPROFSYS_VERBOSE_F(1, "rocprofiler is shutdown\n");
}
// Returns a scope::transient_destructor constructed from two callbacks operating on
// the file-local rocprofiler_activity_count() atomic: the first decrements it, the
// second increments it.
// NOTE(review): presumably one callback runs when the guard is created and the other
// when it is destroyed, so the counter is only modified while the guard is alive --
// which callback runs when depends on transient_destructor's constructor; confirm.
scope::transient_destructor
rocprofiler::protect_flush_activity()
{
return scope::transient_destructor([]() { --rocprofiler_activity_count(); },
[]() { ++rocprofiler_activity_count(); });
}
} // namespace component
} // namespace rocprofsys
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(rocprofiler, false, void)
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(rocprofiler_data, true,
tim::component::rocprofiler_value)
@@ -1,241 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/components/fwd.hpp"
#include "core/defines.hpp"
#include "library/thread_data.hpp"
#include <timemory/api.hpp>
#include <timemory/backends/hardware_counters.hpp>
#include <timemory/components/base.hpp>
#include <timemory/components/data_tracker/components.hpp>
#include <timemory/components/macros.hpp>
#include <timemory/enum.h>
#include <timemory/macros.hpp>
#include <timemory/macros/os.hpp>
#include <timemory/mpl/concepts.hpp>
#include <timemory/mpl/macros.hpp>
#include <timemory/mpl/type_traits.hpp>
#include <timemory/mpl/types.hpp>
#include <timemory/utility/transient_function.hpp>
#include <array>
#include <cstdint>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
namespace rocprofsys
{
namespace component
{
using rocm_metric_type = unsigned long long;
using rocm_info_entry = ::tim::hardware_counters::info;
using rocm_feature_value = std::variant<uint32_t, float, uint64_t, double>;
struct rocm_counter
{
std::array<rocm_metric_type, ROCPROFSYS_ROCM_MAX_COUNTERS> counters;
};
// One recorded ROCm event: the identifiers and entry/exit values handed to the
// constructor plus the per-feature results extracted from the rocprofiler feature
// array.
struct rocm_event
{
using value_type = rocm_feature_value;
uint32_t device_id = 0;
uint32_t thread_id = 0;
uint32_t queue_id = 0;
// begin/end values passed to the constructor
rocm_metric_type entry = 0;
rocm_metric_type exit = 0;
std::string name = {};
// indices into the feature array given to the constructor; as_string() resolves
// them to labels through rocprofsys::rocprofiler::get_data_labels()
std::vector<size_t> feature_names = {};
// value extracted for each feature (alternative depends on the feature's data kind)
std::vector<rocm_feature_value> feature_values = {};
rocm_event() = default;
// _features is an opaque pointer that the implementation casts to
// rocprofiler_feature_t*; _feature_count is the number of entries it holds
rocm_event(uint32_t _dev, uint32_t _thr, uint32_t _queue, std::string _event_name,
rocm_metric_type begin, rocm_metric_type end, uint32_t _feature_count,
void* _features);
// human-readable, single-line description of the event
std::string as_string() const;
// streams the result of as_string()
friend std::ostream& operator<<(std::ostream& _os, const rocm_event& _v)
{
return (_os << _v.as_string());
}
// lexicographic ordering on (device_id, queue_id, entry, thread_id)
friend bool operator<(const rocm_event& _lhs, const rocm_event& _rhs)
{
return std::tie(_lhs.device_id, _lhs.queue_id, _lhs.entry, _lhs.thread_id) <
std::tie(_rhs.device_id, _rhs.queue_id, _rhs.entry, _rhs.thread_id);
}
};
using rocm_data_t = std::vector<rocm_event>;
using rocm_data_tracker = data_tracker<rocm_feature_value, rocm_event>;
rocprofsys::unique_ptr_t<rocm_data_t>&
rocm_data(int64_t _tid = threading::get_id());
using rocprofiler_value = typename rocm_event::value_type;
using rocprofiler_data = data_tracker<rocprofiler_value, rocprofiler>;
// Component wrapping the lifetime of the rocprofiler backend. It stores no value
// itself (value_type is void); start()/stop() consult the instance_tracker base to
// decide when setup()/shutdown() are invoked.
struct rocprofiler
: base<rocprofiler, void>
, private policy::instance_tracker<rocprofiler, false>
{
using value_type = void;
using base_type = base<rocprofiler, void>;
using tracker_type = policy::instance_tracker<rocprofiler, false>;
ROCPROFSYS_DEFAULT_OBJECT(rocprofiler)
// assigns the label and description of rocprofiler_data
static void preinit();
// forwards to setup()
static void global_init() { setup(); }
// forwards to shutdown()
static void global_finalize() { shutdown(); }
static bool is_setup();
static void setup();
static void shutdown();
// registration hooks for extra setup/shutdown routines, keyed by label
static void add_setup(const std::string&, std::function<void()>&&);
static void add_shutdown(const std::string&, std::function<void()>&&);
static void remove_setup(const std::string&);
static void remove_shutdown(const std::string&);
// calls setup() when tracker_type::start() returns 0
void start();
// calls shutdown() when tracker_type::stop() returns 0
void stop();
// this function protects rocprofiler_flush_activity from being called
// when rocprof-sys exits during a callback
[[nodiscard]] static scope::transient_destructor protect_flush_activity();
};
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
inline void
rocprofiler::setup()
{}
inline void
rocprofiler::shutdown()
{}
inline bool
rocprofiler::is_setup()
{
return false;
}
#endif
} // namespace component
} // namespace rocprofsys
namespace tim
{
namespace component
{
using ::rocprofsys::component::rocm_data_tracker;
using ::rocprofsys::component::rocm_feature_value;
using ::rocprofsys::component::rocprofiler_data;
using ::rocprofsys::component::rocprofiler_value;
} // namespace component
} // namespace tim
namespace tim
{
namespace operation
{
template <>
struct set_storage<component::rocm_data_tracker>
{
using T = component::rocm_data_tracker;
static constexpr size_t max_threads = 4096;
using type = T;
using storage_array_t = std::array<storage<type>*, max_threads>;
friend struct get_storage<component::rocm_data_tracker>;
ROCPROFSYS_DEFAULT_OBJECT(set_storage)
auto operator()(storage<type>*, size_t) const {}
auto operator()(type&, size_t) const {}
auto operator()(storage<type>* _v) const { get().fill(_v); }
private:
static storage_array_t& get()
{
static storage_array_t _v = { nullptr };
return _v;
}
};
template <>
struct get_storage<component::rocm_data_tracker>
{
using type = component::rocm_data_tracker;
ROCPROFSYS_DEFAULT_OBJECT(get_storage)
auto operator()(const type&) const
{
return operation::set_storage<type>::get().at(0);
}
auto operator()() const
{
type _obj{};
return (*this)(_obj);
}
auto operator()(size_t _idx) const
{
return operation::set_storage<type>::get().at(_idx);
}
auto operator()(type&, size_t _idx) const { return (*this)(_idx); }
};
} // namespace operation
} // namespace tim
#if !defined(ROCPROFSYS_USE_ROCPROFILER)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler_data, false_type)
#endif
TIMEMORY_SET_COMPONENT_API(component::rocprofiler_data, project::timemory,
category::timing, os::supports_unix)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_timing_category, component::rocprofiler_data,
false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::rocprofiler_data,
false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_units, component::rocprofiler_data, false_type)
TIMEMORY_STATISTICS_TYPE(component::rocprofiler_data, component::rocprofiler_value)
TIMEMORY_STATISTICS_TYPE(component::rocm_data_tracker, component::rocm_feature_value)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_units, component::rocm_data_tracker, false_type)
// Extern-component declarations; emitted unless ROCPROFSYS_EXTERN_COMPONENTS is
// defined to a value <= 0.
// NOTE(review): rocprofiler_data is declared here with `double`, whereas the matching
// ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT in rocprofiler.cpp passes
// tim::component::rocprofiler_value -- confirm the two are meant to differ.
#if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \
(defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0)
# include <timemory/operations.hpp>
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(rocprofiler, false, void)
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(rocprofiler_data, true, double)
#endif
-396
ファイルの表示
@@ -1,396 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/roctracer.hpp"
#include "core/common.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/defines.hpp"
#include "core/dynamic_library.hpp"
#include "core/redirect.hpp"
#include "library/roctracer.hpp"
#include "library/runtime.hpp"
#include "library/thread_data.hpp"
#include "library/thread_info.hpp"
#include <chrono>
#include <roctracer.h>
#define HIP_PROF_HIP_API_STRING 1
#include <roctracer_ext.h>
#include <roctracer_hip.h>
#if ROCPROFSYS_HIP_VERSION < 50300
# include <roctracer_hcc.h>
#endif
#define AMD_INTERNAL_BUILD 1
#include <roctracer_hsa.h>
namespace rocprofsys
{
namespace component
{
namespace
{
auto&
roctracer_activity_count()
{
static std::atomic<int64_t> _v{ 0 };
return _v;
}
} // namespace
void
roctracer::preinit()
{
roctracer_data::label() = "roctracer";
roctracer_data::description() = "ROCm tracer (activity API)";
}
void
roctracer::start()
{
if(tracker_type::start() == 0) setup(nullptr);
}
void
roctracer::stop()
{
if(tracker_type::stop() == 0) shutdown();
}
bool
roctracer::is_setup()
{
return roctracer_is_setup();
}
void
roctracer::add_setup(const std::string& _lbl, std::function<void()>&& _func)
{
roctracer_setup_routines().emplace_back(_lbl, std::move(_func));
}
void
roctracer::add_shutdown(const std::string& _lbl, std::function<void()>&& _func)
{
roctracer_shutdown_routines().emplace_back(_lbl, std::move(_func));
}
// Unregisters the first setup routine that was added under the label _lbl.
// Does nothing when no routine carries that label.
void
roctracer::remove_setup(const std::string& _lbl)
{
    auto& _routines = roctracer_setup_routines();

    // advance to the first entry whose label matches (or to the end)
    auto _pos = _routines.begin();
    while(_pos != _routines.end() && !(_pos->first == _lbl))
        ++_pos;

    if(_pos != _routines.end()) _routines.erase(_pos);
}
void
roctracer::remove_shutdown(const std::string& _lbl)
{
auto& _data = roctracer_setup_routines();
for(auto itr = _data.begin(); itr != _data.end(); ++itr)
{
if(itr->first == _lbl)
{
_data.erase(itr);
break;
}
}
}
void
roctracer::setup(void* table, bool on_load_trace)
{
if(!get_use_roctracer()) return;
auto_lock_t _lk{ type_mutex<roctracer>() };
if(roctracer_is_setup()) return;
roctracer_is_setup() = true;
ROCPROFSYS_VERBOSE_F(1, "setting up roctracer...\n");
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
dynamic_library _amdhip64{ "ROCPROFSYS_ROCTRACER_LIBAMDHIP64",
find_library_path("libamdhip64.so",
{ "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" },
{ ROCPROFSYS_DEFAULT_ROCM_PATH }) };
#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR < 4
dynamic_library _kfdwrapper{
"ROCPROFSYS_ROCTRACER_LIBKFDWRAPPER",
find_library_path("libkfdwrapper64.so", { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" },
{ ROCPROFSYS_DEFAULT_ROCM_PATH },
{ "roctracer/lib", "roctracer/lib64", "lib", "lib64" })
};
#endif
ROCPROFSYS_ROCTRACER_CALL(roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr));
// Allocating tracing pool
roctracer_properties_t properties{};
memset(&properties, 0, sizeof(roctracer_properties_t));
// properties.mode = 0x1000;
properties.buffer_size = 0x100;
properties.buffer_callback_fun = hip_activity_callback;
ROCPROFSYS_ROCTRACER_CALL(roctracer_open_pool(&properties));
#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 4
// HIP 4.5.0 has an invalid warning
redirect _rd{ std::cerr, "roctracer_enable_callback(), get_op_end(), invalid domain "
"ID(4) in: roctracer_enable_callback(hip_api_callback, "
"nullptr)roctracer_enable_activity_expl(), get_op_end(), "
"invalid domain ID(4) in: roctracer_enable_activity()" };
#endif
if(get_trace_hip_api())
{
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback(
ACTIVITY_DOMAIN_HIP_API, hip_api_callback, nullptr));
}
if(get_use_roctx())
{
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback(
ACTIVITY_DOMAIN_ROCTX, roctx_api_callback, nullptr));
}
if(get_trace_hip_activity())
{
// Enable HIP activity tracing
ROCPROFSYS_ROCTRACER_CALL(
roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS));
}
if(table != nullptr)
{
ROCPROFSYS_VERBOSE(1 || on_load_trace, "[OnLoad] setting up HSA...\n");
bool trace_hsa_api = get_trace_hsa_api();
// Enable HSA API callbacks/activity
if(trace_hsa_api)
{
std::vector<std::string> hsa_api_vec =
tim::delimit(get_trace_hsa_api_types());
// initialize HSA tracing
roctracer_set_properties(
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API), (void*) table);
if(!hsa_api_vec.empty())
{
for(const auto& itr : hsa_api_vec)
{
uint32_t cid = HSA_API_ID_NUMBER;
const char* api = itr.c_str();
ROCPROFSYS_ROCTRACER_CALL(roctracer_op_code(
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API), api,
&cid, nullptr));
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_op_callback(
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API), cid,
hsa_api_callback, nullptr));
ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-trace(%s)", api);
}
}
else
{
ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-trace()\n");
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_domain_callback(
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_API),
hsa_api_callback, nullptr));
}
}
bool trace_hsa_activity = get_trace_hsa_activity();
// Enable HSA GPU activity
if(trace_hsa_activity)
{
#if ROCPROFSYS_HIP_VERSION < 50300
using namespace roctracer;
// initialize HSA tracing
const char* output_prefix = nullptr;
hsa_ops_properties_t ops_properties{
table, reinterpret_cast<activity_async_callback_t>(hsa_activity_callback),
nullptr, output_prefix
};
#elif ROCPROFSYS_HIP_VERSION < 50301
hsa_ops_properties_t ops_properties;
ops_properties.table = table;
ops_properties.reserved1[0] = reinterpret_cast<void*>(&hsa_activity_callback);
ops_properties.reserved1[1] = nullptr;
ops_properties.reserved1[2] = nullptr;
#else
hsa_ops_properties_t ops_properties{
table, reinterpret_cast<void*>(&hsa_activity_callback), nullptr, nullptr
};
#endif
roctracer_set_properties(
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_OPS), &ops_properties);
ROCPROFSYS_VERBOSE(1 || on_load_trace, " HSA-activity-trace()\n");
ROCPROFSYS_ROCTRACER_CALL(roctracer_enable_op_activity(
static_cast<activity_domain_t>(ACTIVITY_DOMAIN_HSA_OPS), HSA_OP_ID_COPY));
}
}
// callback for HSA
for(auto& itr : roctracer_setup_routines())
itr.second();
// make sure all async callbacks are allocated
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
hip_exec_activity_callbacks(i);
ROCPROFSYS_VERBOSE_F(1, "roctracer is setup\n");
}
// Flushes pending roctracer activity records and then runs the per-thread HIP
// activity callbacks for every thread index up to thread_info::get_peak_num_threads().
void
roctracer::flush()
{
// polls roctracer_activity_count() until it drops to zero, sleeping 100 ms between
// checks and giving up after 10 iterations
auto wait_for_activity_flush_completion = []() {
uint16_t nitr = 0;
while(roctracer_activity_count() > 0 && nitr++ < 10)
std::this_thread::sleep_for(std::chrono::milliseconds{ 100 });
};
// a flush may already be happening
wait_for_activity_flush_completion();
if(roctracer_activity_count() == 0)
{
ROCPROFSYS_VERBOSE_F(2, "executing roctracer_flush_activity()...\n");
ROCPROFSYS_ROCTRACER_CALL(roctracer_flush_activity());
// wait to make sure flush completes
std::this_thread::sleep_for(std::chrono::milliseconds{ 100 });
wait_for_activity_flush_completion();
}
else
{
// the counter never returned to zero within the wait above, so
// roctracer_flush_activity() is not invoked
ROCPROFSYS_CI_FAIL(true,
"roctracer_activity_count() != 0 (== %li). "
"roctracer::shutdown() most likely called during abort",
roctracer_activity_count().load());
}
ROCPROFSYS_VERBOSE_F(2, "executing hip_exec_activity_callbacks(0..%zu)\n",
thread_info::get_peak_num_threads());
// make sure all async operations are executed
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
hip_exec_activity_callbacks(i);
ROCPROFSYS_VERBOSE_F(2, "roctracer flush completed\n");
}
void
roctracer::shutdown()
{
auto_lock_t _lk{ type_mutex<roctracer>() };
if(!roctracer_is_setup()) return;
roctracer_is_setup() = false;
ROCPROFSYS_VERBOSE_F(1, "shutting down roctracer...\n");
// callback for hsa
ROCPROFSYS_VERBOSE_F(2, "executing %zu roctracer_shutdown_routines...\n",
roctracer_shutdown_routines().size());
for(auto& itr : roctracer_shutdown_routines())
itr.second();
#if ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 4
ROCPROFSYS_DEBUG_F("redirecting roctracer warnings\n");
// HIP 4.5.0 has an invalid warning
redirect _rd{
std::cerr, "roctracer_disable_callback(), get_op_end(), invalid domain ID(4) "
"in: roctracer_disable_callback()roctracer_disable_activity(), "
"get_op_end(), invalid domain ID(4) in: roctracer_disable_activity()"
};
#endif
if(get_trace_hip_api())
{
ROCPROFSYS_VERBOSE_F(
2,
"executing roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)...\n");
ROCPROFSYS_ROCTRACER_CALL(
roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API));
}
if(get_use_roctx())
{
ROCPROFSYS_VERBOSE_F(
2, "executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_ROCTX)...\n");
ROCPROFSYS_ROCTRACER_CALL(
roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX));
}
if(get_trace_hip_activity())
{
ROCPROFSYS_VERBOSE_F(
2,
"executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS)...\n");
ROCPROFSYS_ROCTRACER_CALL(
roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS));
}
if(get_trace_hsa_api())
{
ROCPROFSYS_VERBOSE_F(
2,
"executing roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HSA_API)...\n");
ROCPROFSYS_ROCTRACER_CALL(
roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_API));
}
if(get_trace_hsa_api())
{
ROCPROFSYS_VERBOSE_F(
2, "executing roctracer_disable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, "
"HSA_OP_ID_COPY)...\n");
ROCPROFSYS_ROCTRACER_CALL(
roctracer_disable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY));
}
ROCPROFSYS_VERBOSE_F(1, "roctracer is shutdown\n");
}
scope::transient_destructor
roctracer::protect_flush_activity()
{
return scope::transient_destructor([]() { --roctracer_activity_count(); },
[]() { ++roctracer_activity_count(); });
}
} // namespace component
} // namespace rocprofsys
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(roctracer, false, void)
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(roctracer_data, true, double)
-117
ファイルの表示
@@ -1,117 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/common.hpp"
#include "core/components/fwd.hpp"
#include "core/defines.hpp"
#include <timemory/api.hpp>
#include <timemory/components/base.hpp>
#include <timemory/components/data_tracker/components.hpp>
#include <timemory/components/macros.hpp>
#include <timemory/enum.h>
#include <timemory/macros/os.hpp>
#include <timemory/mpl/type_traits.hpp>
#include <timemory/mpl/types.hpp>
#include <timemory/utility/transient_function.hpp>
ROCPROFSYS_COMPONENT_ALIAS(roctracer_data,
::tim::component::data_tracker<double, roctracer>)
namespace rocprofsys
{
namespace component
{
// Component wrapping the lifetime of the roctracer backend. It stores no value itself
// (value_type is void); start()/stop() consult the instance_tracker base to decide
// when setup()/shutdown() are invoked.
struct roctracer
: base<roctracer, void>
, private policy::instance_tracker<roctracer, false>
{
using value_type = void;
using base_type = base<roctracer, void>;
using tracker_type = policy::instance_tracker<roctracer, false>;
ROCPROFSYS_DEFAULT_OBJECT(roctracer)
// assigns the label and description of roctracer_data
static void preinit();
// forwards to shutdown()
static void global_finalize() { shutdown(); }
static bool is_setup();
// hsa_api_table may be null, in which case the HSA domains are not configured
static void setup(void* hsa_api_table, bool on_load_trace = false);
static void flush();
static void shutdown();
// register/unregister extra routines, keyed by label, that setup()/shutdown() run
static void add_setup(const std::string&, std::function<void()>&&);
static void add_shutdown(const std::string&, std::function<void()>&&);
static void remove_setup(const std::string&);
static void remove_shutdown(const std::string&);
// calls setup(nullptr) when tracker_type::start() returns 0
void start();
// calls shutdown() when tracker_type::stop() returns 0
void stop();
// this function protects roctracer_flush_activity from being called
// when rocprof-sys exits during a callback
[[nodiscard]] static scope::transient_destructor protect_flush_activity();
};
#if !defined(ROCPROFSYS_USE_ROCTRACER)
inline void
roctracer::setup(void*, bool)
{}
inline void
roctracer::flush()
{}
inline void
roctracer::shutdown()
{}
inline bool
roctracer::is_setup()
{
return false;
}
#endif
} // namespace component
} // namespace rocprofsys
#if !defined(ROCPROFSYS_USE_ROCTRACER)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer_data, false_type)
#endif
TIMEMORY_SET_COMPONENT_API(rocprofsys::component::roctracer_data, project::timemory,
category::timing, os::supports_unix)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_timing_category, component::roctracer_data, true_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::roctracer_data, true_type)
#if defined(ROCPROFSYS_USE_ROCTRACER) && ROCPROFSYS_USE_ROCTRACER > 0
# if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \
(defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0)
# include <timemory/operations.hpp>
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(roctracer, false, void)
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(roctracer_data, true, double)
# endif
#endif
+8 -202
ファイルの表示
@@ -25,12 +25,8 @@
#include "core/debug.hpp"
#include "core/dynamic_library.hpp"
#include "core/gpu.hpp"
#include "library/components/rocprofiler.hpp"
#include "library/components/roctracer.hpp"
#include "library/rocm/hsa_rsrc_factory.hpp"
#include "library/rocm_smi.hpp"
#include "library/rocprofiler.hpp"
#include "library/roctracer.hpp"
#include "library/rocprofiler-sdk.hpp"
#include "library/runtime.hpp"
#include "library/thread_data.hpp"
#include "library/tracing.hpp"
@@ -46,208 +42,18 @@
#include <mutex>
#include <tuple>
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
# include <rocprofiler.h>
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
# include <rocprofiler-sdk/rocprofiler.h>
#endif
using namespace rocprofsys;
namespace rocprofsys
{
namespace rocm
{
std::mutex rocm_mutex = {};
bool is_loaded = false;
bool on_load_trace = (get_env<int>("ROCP_ONLOAD_TRACE", 0) > 0);
std::vector<hardware_counter_info>
rocm_events()
{
return rocprofiler_sdk::get_rocm_events_info();
}
} // namespace rocm
} // namespace rocprofsys
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
std::ostream&
operator<<(std::ostream& _os, const rocprofiler_settings_t& _v)
{
# define ROCPROF_SETTING_FIELD_STR(NAME) JOIN('=', # NAME, _v.NAME)
_os << JOIN(
", ", ROCPROF_SETTING_FIELD_STR(intercept_mode),
ROCPROF_SETTING_FIELD_STR(code_obj_tracking),
ROCPROF_SETTING_FIELD_STR(memcopy_tracking),
ROCPROF_SETTING_FIELD_STR(trace_size), ROCPROF_SETTING_FIELD_STR(trace_local),
ROCPROF_SETTING_FIELD_STR(timeout), ROCPROF_SETTING_FIELD_STR(timestamp_on),
ROCPROF_SETTING_FIELD_STR(hsa_intercepting),
ROCPROF_SETTING_FIELD_STR(k_concurrent), ROCPROF_SETTING_FIELD_STR(opt_mode),
ROCPROF_SETTING_FIELD_STR(obj_dumping));
return _os;
}
#endif
// HSA-runtime tool on-load method
extern "C"
{
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
void OnUnloadTool()
{
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading...\n");
rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock };
if(!_lk.owns_lock()) _lk.lock();
if(!rocm::is_loaded)
{
ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace,
"rocprofiler is not loaded\n");
return;
}
rocm::is_loaded = false;
_lk.unlock();
// stop_top_level_timer_if_necessary();
// Final resources cleanup
rocprofsys::rocprofiler::rocm_cleanup();
}
// rocprofiler tool on-load hook.
//
// Returns immediately unless rocprofiler usage is enabled in the configuration and at
// least one ROCm event is configured. Otherwise it marks the tool as loaded (once),
// fills in the fields of `settings` that this tool relies on, initializes the
// rocprofiler integration and prints the GPU agents.
//
// settings: rocprofiler settings structure supplied by the caller; modified in place.
void OnLoadToolProp(rocprofiler_settings_t* settings)
{
    using ::rocprofiler::util::HsaRsrcFactory;

    const bool _collect_events =
        config::get_use_rocprofiler() && !config::get_rocm_events().empty();
    if(!_collect_events) return;

    ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n");

    {
        // The mutex is held only while the loaded flag is inspected and updated.
        auto _guard = rocm::lock_t{ rocm::rocm_mutex };
        if(rocm::is_loaded)
        {
            ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace,
                                       "rocprofiler is already loaded\n");
            return;
        }
        rocm::is_loaded = true;
    }

    // Fields set by this tool. The remaining fields (code_obj_tracking,
    // memcopy_tracking, trace_local, opt_mode, trace_size, timeout) keep whatever
    // values the caller provided.
    settings->intercept_mode   = 1;
    settings->hsa_intercepting = 1;
    settings->timestamp_on     = 1;  // enable timestamping
    settings->k_concurrent     = 0;
    settings->obj_dumping      = 0;

    ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "rocprofiler settings: %s\n",
                               JOIN("", *settings).c_str());

    // Initialize profiling
    rocprofsys::rocprofiler::rocm_initialize();

    HsaRsrcFactory::Instance().PrintGpuAgents("ROCm");
}
#endif
// HSA-runtime tool on-load method.
//
// Only the first call does any work (guarded by the function-local `_once` flag);
// every later call returns true immediately.
//
// Parameters:
//   table             - HSA API table; handed to comp::roctracer::setup and, on the
//                       legacy path, to rocprofiler's own OnLoad.
//   runtime_version   - forwarded unchanged to rocprofiler's OnLoad on the legacy path.
//   failed_tool_count - forwarded unchanged to rocprofiler's OnLoad on the legacy path.
//   failed_tool_names - forwarded unchanged to rocprofiler's OnLoad on the legacy path.
//
// Returns true on every early-exit path; otherwise returns `_success`, which is only
// ever changed by the result of invoking rocprofiler's OnLoad.
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names)
{
// presumably marks the parameters as used so that builds which compile out the
// code below do not warn about them -- TODO confirm against timemory
tim::consume_parameters(table, runtime_version, failed_tool_count,
failed_tool_names);
// One-shot guard. NOTE(review): `_once` is a plain static bool; no synchronization
// is visible here.
static bool _once = false;
if(_once) return true;
_once = true;
// NOTE(review): `2 || x` always evaluates to `true`; confirm that this is the value
// the verbosity macro is meant to receive.
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n");
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
// Skip all tooling setup when ROCPROFSYS_INIT_TOOLING reads as false (the default
// passed here is true) or when the timemory settings report "disabled".
if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true)) return true;
if(!tim::settings::enabled()) return true;
roctracer_is_init() = true;
ROCPROFSYS_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "Loading ROCm tooling...\n");
// Initialize the tooling here if the settings have not been configured yet and the
// state is still below State::Active.
if(!config::settings_are_configured() && get_state() < State::Active)
rocprofsys_init_tooling_hidden();
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
// Compiled only when ROCPROFSYS_HIP_VERSION < 50300 (presumably HIP 5.3.0): computes
// the clock skew and discards the returned value.
#if ROCPROFSYS_HIP_VERSION < 50300
ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace,
"Computing the roctracer clock skew...\n");
(void) rocprofsys::get_clock_skew();
#endif
// rocm_smi is activated only when both process sampling and rocm_smi are enabled.
if(get_use_process_sampling() && get_use_rocm_smi())
{
ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace,
"Setting rocm_smi state to active...\n");
rocm_smi::set_state(State::Active);
}
comp::roctracer::setup(static_cast<void*>(table), rocm::on_load_trace);
// Forcing rocprofiler initialization is only possible when rocprofiler support is
// compiled in; the default of the environment variable is false.
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
bool _force_rocprofiler_init =
tim::get_env("ROCPROFSYS_FORCE_ROCPROFILER_INIT", false, false);
#else
bool _force_rocprofiler_init = false;
#endif
bool _success = true;
// True only when the settings are already configured and no ROCm events are set.
bool _is_empty =
(config::settings_are_configured() && config::get_rocm_events().empty());
if(_force_rocprofiler_init || (get_use_rocprofiler() && !_is_empty))
{
// The body of this branch exists only for ROCPROFSYS_HIP_VERSION < 50500; for newer
// versions the branch is empty, so the GPU agents are not printed and `_success`
// stays true.
#if ROCPROFSYS_HIP_VERSION < 50500
// Locate librocprofiler64.so (overridable through ROCPROFSYS_ROCPROFILER_LIBRARY),
// open it and call its OnLoad with the arguments this function received.
auto _rocprof = dynamic_library{
"ROCPROFSYS_ROCPROFILER_LIBRARY",
find_library_path(
"librocprofiler64.so", { "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" },
{ ROCPROFSYS_DEFAULT_ROCM_PATH },
{ "lib", "lib64", "rocprofiler/lib", "rocprofiler/lib64" }),
(RTLD_LAZY | RTLD_GLOBAL), false
};
ROCPROFSYS_VERBOSE_F(1 || rocm::on_load_trace,
"Loading rocprofiler library (%s=%s)...\n",
_rocprof.envname.c_str(), _rocprof.filename.c_str());
_rocprof.open();
on_load_t _rocprof_load = nullptr;
_success = _rocprof.invoke("OnLoad", _rocprof_load, table, runtime_version,
failed_tool_count, failed_tool_names);
// A failed invocation is reported with a printed warning and additionally through
// ROCPROFSYS_CI_THROW (presumably throws only in CI builds -- TODO confirm).
ROCPROFSYS_CONDITIONAL_PRINT_F(!_success,
"Warning! Invoking rocprofiler's OnLoad "
"failed! ROCPROFSYS_ROCPROFILER_LIBRARY=%s\n",
_rocprof.filename.c_str());
ROCPROFSYS_CI_THROW(!_success,
"Warning! Invoking rocprofiler's OnLoad "
"failed! ROCPROFSYS_ROCPROFILER_LIBRARY=%s\n",
_rocprof.filename.c_str());
#endif
}
else
{
// rocprofiler is not used: only print the GPU agents.
using ::rocprofiler::util::HsaRsrcFactory;
HsaRsrcFactory::Instance().PrintGpuAgents("ROCm");
}
gpu::add_hip_device_metadata();
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading... %s\n",
(_success) ? "Done" : "Failed");
return _success;
}
// HSA-runtime on-unload method.
// Calls rocprofsys_finalize_hidden() and logs a message before and after that call.
void OnUnload()
{
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading...\n");
rocprofsys_finalize_hidden();
ROCPROFSYS_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Unloading... Done\n");
}
}
+26 -14
ファイルの表示
@@ -23,36 +23,48 @@
#pragma once
#include "core/defines.hpp"
#include "core/timemory.hpp"
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
# include <rocprofiler.h>
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
# include <rocprofiler-sdk/registration.h>
# include <rocprofiler-sdk/rocprofiler.h>
#endif
#include <cstdint>
#include <mutex>
#include <vector>
namespace rocprofsys
{
namespace rocm
{
using lock_t = std::unique_lock<std::mutex>;
using hardware_counter_info = ::tim::hardware_counters::info;
extern std::mutex rocm_mutex;
extern bool is_loaded;
std::vector<hardware_counter_info>
rocm_events();
#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0
inline std::vector<hardware_counter_info>
rocm_events()
{
return std::vector<hardware_counter_info>();
}
#endif
} // namespace rocm
} // namespace rocprofsys
extern "C"
{
struct HsaApiTable;
using on_load_t = bool (*)(HsaApiTable*, uint64_t, uint64_t, const char* const*);
struct rocprofiler_tool_configure_result_t;
struct rocprofiler_client_id_t;
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names) ROCPROFSYS_PUBLIC_API;
void OnUnload() ROCPROFSYS_PUBLIC_API;
using rocprofiler_configure_t =
rocprofiler_tool_configure_result_t* (*) (uint32_t version,
const char* runtime_version,
uint32_t priority,
rocprofiler_client_id_t* client_id);
#if defined(ROCPROFSYS_USE_ROCPROFILER) && ROCPROFSYS_USE_ROCPROFILER > 0
void OnLoadToolProp(rocprofiler_settings_t* settings) ROCPROFSYS_PUBLIC_API;
void OnUnloadTool() ROCPROFSYS_PUBLIC_API;
#endif
rocprofiler_tool_configure_result_t* rocprofiler_configure(
uint32_t version, const char* runtime_version, uint32_t priority,
rocprofiler_client_id_t* client_id) ROCPROFSYS_PUBLIC_API;
}
-7
ファイルの表示
@@ -1,7 +0,0 @@
# Add the HSA resource factory sources to the object library when either
# ROCPROFSYS_USE_ROCPROFILER or ROCPROFSYS_USE_ROCTRACER is enabled.
if(ROCPROFSYS_USE_ROCPROFILER OR ROCPROFSYS_USE_ROCTRACER)
    target_sources(
        rocprofiler-systems-object-library
        PRIVATE "${CMAKE_CURRENT_LIST_DIR}/hsa_rsrc_factory.hpp"
                "${CMAKE_CURRENT_LIST_DIR}/hsa_rsrc_factory.cpp")
endif()
ファイル差分が大きすぎるため省略します 差分を読み込み
-582
ファイルの表示
@@ -1,582 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/exception.hpp"
#define AMD_INTERNAL_BUILD 1
#include <hsa.h>
#include <hsa_api_trace.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
#include <hsa_ven_amd_aqlprofile.h>
#include <hsa_ven_amd_loader.h>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <vector>
#define HSA_ARGUMENT_ALIGN_BYTES 16
#define HSA_QUEUE_ALIGN_BYTES 64
#define HSA_PACKET_ALIGN_BYTES 64
#define HSA_MESSAGE_LENGTH 4096
// Throws ::rocprofsys::exception<std::runtime_error> with the text
// "<msg>: <HSA status string>" when `status` is not HSA_STATUS_SUCCESS.
// `status` is evaluated exactly once, so it is safe to pass a function call.
#define CHECK_STATUS(msg, status)                                                        \
    do                                                                                   \
    {                                                                                    \
        const auto _hsa_check_status = (status);                                         \
        if(_hsa_check_status != HSA_STATUS_SUCCESS)                                      \
        {                                                                                \
            const char* emsg = nullptr;                                                  \
            hsa_status_string(_hsa_check_status, &emsg);                                 \
            char _buffer[HSA_MESSAGE_LENGTH];                                            \
            snprintf(_buffer, HSA_MESSAGE_LENGTH - 1, "%s: %s", (msg),                   \
                     emsg ? emsg : "<unknown error>");                                   \
            throw ::rocprofsys::exception<std::runtime_error>(_buffer);                  \
        }                                                                                \
    } while(0)
// Variant of CHECK_STATUS for iteration calls: throws
// ::rocprofsys::exception<std::runtime_error> with the text
// "<msg>: <HSA status string>" when `status` is not HSA_STATUS_INFO_BREAK.
// `status` is evaluated exactly once, so it is safe to pass a function call.
#define CHECK_ITER_STATUS(msg, status)                                                   \
    do                                                                                   \
    {                                                                                    \
        const auto _hsa_check_status = (status);                                         \
        if(_hsa_check_status != HSA_STATUS_INFO_BREAK)                                   \
        {                                                                                \
            const char* emsg = nullptr;                                                  \
            hsa_status_string(_hsa_check_status, &emsg);                                 \
            char _buffer[HSA_MESSAGE_LENGTH];                                            \
            snprintf(_buffer, HSA_MESSAGE_LENGTH - 1, "%s: %s", (msg),                   \
                     emsg ? emsg : "<unknown error>");                                   \
            throw ::rocprofsys::exception<std::runtime_error>(_buffer);                  \
        }                                                                                \
    } while(0)
namespace rocprofiler
{
namespace util
{
static const size_t MEM_PAGE_BYTES = 0x1000;
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
// Table of pointers to HSA runtime functions. Every member is named after the global
// HSA function it points to and has exactly that function's type (via decltype).
// Nothing in this struct assigns the pointers; they must be filled in elsewhere.
struct hsa_pfn_t
{
decltype(::hsa_init)* hsa_init;
decltype(::hsa_shut_down)* hsa_shut_down;
decltype(::hsa_agent_get_info)* hsa_agent_get_info;
decltype(::hsa_iterate_agents)* hsa_iterate_agents;
decltype(::hsa_queue_create)* hsa_queue_create;
decltype(::hsa_queue_destroy)* hsa_queue_destroy;
decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(
::hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl;
decltype(::hsa_signal_create)* hsa_signal_create;
decltype(::hsa_signal_destroy)* hsa_signal_destroy;
decltype(::hsa_signal_load_relaxed)* hsa_signal_load_relaxed;
decltype(::hsa_signal_store_relaxed)* hsa_signal_store_relaxed;
decltype(::hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire;
decltype(::hsa_signal_store_screlease)* hsa_signal_store_screlease;
decltype(::hsa_code_object_reader_create_from_file)*
hsa_code_object_reader_create_from_file;
decltype(::hsa_executable_create_alt)* hsa_executable_create_alt;
decltype(
::hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
decltype(::hsa_executable_freeze)* hsa_executable_freeze;
decltype(::hsa_executable_destroy)* hsa_executable_destroy;
decltype(::hsa_executable_get_symbol)* hsa_executable_get_symbol;
decltype(::hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info;
decltype(::hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols;
decltype(::hsa_system_get_info)* hsa_system_get_info;
decltype(
::hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table;
decltype(::hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools;
decltype(::hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info;
decltype(::hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate;
decltype(::hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access;
decltype(::hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy;
decltype(::hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler;
decltype(
::hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled;
decltype(
::hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time;
decltype(::hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time;
};
// Encapsulates information about an HSA agent such as its
// handle, name, max queue size, max wavefront size, etc.
struct AgentInfo
{
// Handle of the agent
hsa_agent_t dev_id;
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
uint32_t dev_type;
// APU flag
bool is_apu;
// Agent system index
uint32_t dev_index;
// GFXIP name
char gfxip[64];
// Name of the agent; its length is less than 64
char name[64];
// Maximum wavefront size
uint32_t max_wave_size;
// Maximum size of the queue buffer
uint32_t max_queue_size;
// HSAIL profile supported by the agent
hsa_profile_t profile;
// CPU/GPU/kern-arg memory pools
hsa_amd_memory_pool_t cpu_pool;
hsa_amd_memory_pool_t gpu_pool;
hsa_amd_memory_pool_t kern_arg_pool;
// Number of compute units available in the agent
uint32_t cu_num;
// Maximum number of waves possible in a compute unit
uint32_t waves_per_cu;
// Number of SIMDs per compute unit (CU)
uint32_t simds_per_cu;
// Number of shader engines (SE) in the GPU
uint32_t se_num;
// Number of shader arrays per shader engine in the GPU
uint32_t shader_arrays_per_se;
// SGPR/VGPR/LDS block sizes
uint32_t sgpr_block_dflt;
uint32_t sgpr_block_size;
uint32_t vgpr_block_size;
// LDS block size; a compile-time constant shared by all instances
static const uint32_t lds_block_size = 128 * 4;
};
// HSA timer class
// Provides current HSA timestampa and system-clock/ns conversion API
class HsaTimer
{
public:
typedef uint64_t timestamp_t;
static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
typedef long double freq_t;
enum time_id_t
{
TIME_ID_CLOCK_REALTIME = 0,
TIME_ID_CLOCK_REALTIME_COARSE = 1,
TIME_ID_CLOCK_MONOTONIC = 2,
TIME_ID_CLOCK_MONOTONIC_COARSE = 3,
TIME_ID_CLOCK_MONOTONIC_RAW = 4,
TIME_ID_NUMBER
};
HsaTimer(const hsa_pfn_t* hsa_api)
: hsa_api_(hsa_api)
{
timestamp_t sysclock_hz = 0;
hsa_status_t status = hsa_api_->hsa_system_get_info(
HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status);
sysclock_factor_ = (freq_t) 1000000000 / (freq_t) sysclock_hz;
}
// Methods for system-clock/ns conversion
timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const
{
return timestamp_t((freq_t) sysclock * sysclock_factor_);
}
timestamp_t ns_to_sysclock(const timestamp_t& time) const
{
return timestamp_t((freq_t) time / sysclock_factor_);
}
// Method for timespec/ns conversion
static timestamp_t timespec_to_ns(const timespec& time)
{
return ((timestamp_t) time.tv_sec * 1000000000) + time.tv_nsec;
}
// Return timestamp in 'ns'
timestamp_t timestamp_ns() const
{
timestamp_t sysclock;
hsa_status_t status =
hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status);
return sysclock_to_ns(sysclock);
}
// Return time in 'ns'
timestamp_t clocktime_ns(clockid_t clock_id) const
{
timespec time;
clock_gettime(clock_id, &time);
return timespec_to_ns(time);
}
// Return pair of correlated values of profiling timestamp and time with
// correlation error for a given time ID and number of iterations
void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v,
timestamp_t* time_v, timestamp_t* error_v)
{
clockid_t clock_id = 0;
switch(time_id)
{
case TIME_ID_CLOCK_REALTIME: clock_id = CLOCK_REALTIME; break;
case TIME_ID_CLOCK_REALTIME_COARSE: clock_id = CLOCK_REALTIME_COARSE; break;
case TIME_ID_CLOCK_MONOTONIC: clock_id = CLOCK_MONOTONIC; break;
case TIME_ID_CLOCK_MONOTONIC_COARSE: clock_id = CLOCK_MONOTONIC_COARSE; break;
case TIME_ID_CLOCK_MONOTONIC_RAW: clock_id = CLOCK_MONOTONIC_RAW; break;
default: CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR);
}
std::vector<timestamp_t> ts_vec(iters);
std::vector<timespec> tm_vec(iters);
const uint32_t steps = iters - 1;
for(uint32_t i = 0; i < iters; ++i)
{
hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]);
clock_gettime(clock_id, &tm_vec[i]);
}
const timestamp_t ts_base = sysclock_to_ns(ts_vec.front());
const timestamp_t tm_base = timespec_to_ns(tm_vec.front());
const timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps);
timestamp_t ts_accum = 0;
timestamp_t tm_accum = 0;
for(uint32_t i = 0; i < iters; ++i)
{
ts_accum += (ts_vec[i] - ts_base);
tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base);
}
*timestamp_v = (ts_accum / iters) + ts_base + error;
*time_v = (tm_accum / iters) + tm_base;
*error_v = error;
}
private:
// Timestamp frequency factor
freq_t sysclock_factor_;
// HSA API table
const hsa_pfn_t* const hsa_api_;
};
// Access point for HSA agents, queues, signals, memory helpers and timing helpers.
// The constructor is private: the shared instance is obtained through Create() or
// Instance() and released with Destroy().
class HsaRsrcFactory
{
public:
    static const size_t CMD_SLOT_SIZE_B = 0x40;
    typedef std::recursive_mutex  mutex_t;
    typedef HsaTimer::timestamp_t timestamp_t;

    // Return the shared instance, constructing it under mutex_ on first use.
    // @param initialize_hsa Forwarded to the constructor when the instance is created
    static HsaRsrcFactory* Create(bool initialize_hsa = true)
    {
        std::lock_guard<mutex_t> lck(mutex_);
        HsaRsrcFactory*          obj = instance_.load(std::memory_order_relaxed);
        if(obj == nullptr)
        {
            obj = new HsaRsrcFactory(initialize_hsa);
            instance_.store(obj, std::memory_order_release);
        }
        return obj;
    }

    // Return a reference to the shared instance; creates it with
    // Create(false) when it does not exist yet. Throws if no instance is available.
    static HsaRsrcFactory& Instance()
    {
        HsaRsrcFactory* obj = instance_.load(std::memory_order_acquire);
        if(obj == nullptr) obj = Create(false);
        hsa_status_t status = (obj != nullptr) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
        CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
        return *obj;
    }

    // Delete the shared instance (if any) and reset the instance pointer.
    static void Destroy()
    {
        std::lock_guard<mutex_t> lck(mutex_);
        if(instance_) delete instance_.load();
        instance_ = nullptr;
    }

    // Return system agent info
    const AgentInfo* GetAgentInfo(const hsa_agent_t agent);

    // Get the count of Hsa Gpu Agents available on the platform
    // @return uint32_t Number of Gpu agents on platform
    uint32_t GetCountOfGpuAgents();

    // Get the count of Hsa Cpu Agents available on the platform
    // @return uint32_t Number of Cpu agents on platform
    uint32_t GetCountOfCpuAgents();

    // Get the AgentInfo handle of a Gpu device
    // @param idx Gpu Agent at specified index
    // @param agent_info Output parameter updated with AgentInfo
    // @return bool true if successful, false otherwise
    bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);

    // Get the AgentInfo handle of a Cpu device
    // @param idx Cpu Agent at specified index
    // @param agent_info Output parameter updated with AgentInfo
    // @return bool true if successful, false otherwise
    bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);

    // Create a Queue object and return its handle. The queue object is expected
    // to support user requested number of Aql dispatch packets.
    // @param agent_info Gpu Agent on which to create a queue object
    // @param num_pkts Number of packets to be held by queue
    // @param queue Output parameter updated with handle of queue object
    // @return bool true if successful, false otherwise
    bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);

    // Create a Signal object and return its handle.
    // @param value Initial value of signal object
    // @param signal Output parameter updated with handle of signal object
    // @return bool true if successful, false otherwise
    bool CreateSignal(uint32_t value, hsa_signal_t* signal);

    // Allocate local GPU memory
    // @param agent_info Agent from whose memory region to allocate
    // @param size Size of memory in terms of bytes
    // @return uint8_t* Pointer to buffer, null if allocation fails.
    uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size);

    // Allocate memory to pass kernel parameters
    // Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo
    // parameter.
    // @param agent_info Agent from whose memory region to allocate
    // @param size Size of memory in terms of bytes
    // @return uint8_t* Pointer to buffer, null if allocation fails.
    uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size);

    // Allocate system memory accessible from both CPU and GPU
    // Memory is allocated accessible to all CPU agents and AgentInfo parameter is
    // ignored.
    // @param agent_info Agent from whose memory region to allocate
    // @param size Size of memory in terms of bytes
    // @return uint8_t* Pointer to buffer, null if allocation fails.
    uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size);

    // Allocate memory for command buffer.
    // @param agent_info Agent from whose memory region to allocate
    // @param size Size of memory in terms of bytes
    // @return uint8_t* Pointer to buffer, null if allocation fails.
    uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);

    // Wait signal
    hsa_signal_value_t SignalWait(const hsa_signal_t&       signal,
                                  const hsa_signal_value_t& signal_value) const;

    // Wait signal with signal value restore
    void SignalWaitRestore(const hsa_signal_t&       signal,
                           const hsa_signal_value_t& signal_value) const;

    // Copy data from GPU to host memory
    bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size);
    bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size);

    // Memory free method
    static bool FreeMemory(void* ptr);

    // Loads an Assembled Brig file and Finalizes it into Device Isa
    // @param agent_info Gpu device for which to finalize
    // @param brig_path File path of the Assembled Brig file
    // @param kernel_name Name of the kernel to finalize
    // @param hsa_exec Executable handle (see the implementation for its exact role)
    // @param code_desc Handle of finalized Code Descriptor that could
    // be used to submit for execution
    // @return true if successful, false otherwise
    bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
                         const char* kernel_name, hsa_executable_t* hsa_exec,
                         hsa_executable_symbol_t* code_desc);

    // Print the various fields of Hsa Gpu Agents
    bool PrintGpuAgents(const std::string& header);

    // Utils for submitting AQL packet to a given queue
    static void*    GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx);
    static void*    GetReadPointer(hsa_queue_t* queue);
    static uint64_t Submit(hsa_queue_t* queue, const void* packet);
    static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);

    // Enable executables loading tracking
    static bool        IsExecutableTracking() { return executable_tracking_on_; }
    static void        EnableExecutableTracking(HsaApiTable* table);
    static const char* GetKernelNameRef(uint64_t addr);

    // Initialize HSA API table
    static void             InitHsaApiTable(HsaApiTable* table);
    static const hsa_pfn_t* HsaApi() { return &hsa_api_; }

    // Return AqlProfile API table
    typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t;
    const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }

    // Return Loader API table
    const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }

    // Methods for system-clock/ns conversion and timestamp in 'ns'
    timestamp_t SysclockToNs(const timestamp_t& sysclock) const
    {
        return timer_->sysclock_to_ns(sysclock);
    }

    timestamp_t NsToSysclock(const timestamp_t& time) const
    {
        return timer_->ns_to_sysclock(time);
    }

    timestamp_t TimestampNs() const { return timer_->timestamp_ns(); }

    timestamp_t        GetSysTimeout() const { return timeout_; }
    static timestamp_t GetTimeoutNs() { return timeout_ns_; }

    // Store the timeout in 'ns' and, when an instance already exists, also update
    // that instance's timeout expressed in system-clock units.
    static void SetTimeoutNs(const timestamp_t& time)
    {
        std::lock_guard<mutex_t> lck(mutex_);
        timeout_ns_ = time;
        if(instance_ != nullptr)
            Instance().timeout_ = Instance().timer_->ns_to_sysclock(time);
    }

    // Sample the timer `iters` times against the clock selected by `time_id` and
    // record the resulting time shift and correlation error for that clock.
    void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters)
    {
        timestamp_t timestamp_v = 0;
        timestamp_t time_v      = 0;
        timestamp_t error_v     = 0;
        timer_->correlated_pair_ns(time_id, iters, &timestamp_v, &time_v, &error_v);
        time_shift_[time_id] = time_v - timestamp_v;
        time_error_[time_id] = error_v;
    }

    // Convert `time_stamp` with the time shift recorded for `time_id`.
    // @return HSA_STATUS_ERROR when time_id is out of range, HSA_STATUS_SUCCESS otherwise
    hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value)
    {
        if(time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;
        *time_value = time_stamp + time_shift_[time_id];
        return HSA_STATUS_SUCCESS;
    }

    // Return the correlation error recorded for `time_id` through `err`.
    // @return HSA_STATUS_ERROR when time_id is out of range (same check as
    // GetTimeVal; `err` is left untouched), HSA_STATUS_SUCCESS otherwise
    hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err)
    {
        if(time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;
        *err = time_error_[time_id];
        return HSA_STATUS_SUCCESS;
    }

private:
    // System agents iterating callback
    static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);

    // Callback function to find and bind kernarg region of an agent
    static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);

    // Load AQL profile HSA extension library directly
    static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);

    // Constructor of the class. Will initialize the Hsa Runtime and
    // query the system topology to get the list of Cpu and Gpu devices
    explicit HsaRsrcFactory(bool initialize_hsa);

    // Destructor of the class
    ~HsaRsrcFactory();

    // Add an instance of AgentInfo representing a Hsa Gpu agent
    const AgentInfo* AddAgentInfo(const hsa_agent_t agent);

    // To mmap command buffer memory
    static const bool CMD_MEMORY_MMAP = false;

    // HSA was initialized
    const bool initialize_hsa_;

    static std::atomic<HsaRsrcFactory*> instance_;
    static mutex_t                      mutex_;

    // Used to maintain a list of Hsa Gpu Agent Info
    std::vector<const AgentInfo*> gpu_list_;
    std::vector<hsa_agent_t>      gpu_agents_;

    // Used to maintain a list of Hsa Cpu Agent Info
    std::vector<const AgentInfo*> cpu_list_;
    std::vector<hsa_agent_t>      cpu_agents_;

    // System agents map
    std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;

    // Executables loading tracking
    typedef std::map<uint64_t, const char*> symbols_map_t;
    static symbols_map_t* symbols_map_;
    static bool           executable_tracking_on_;
    static void*          to_dump_code_obj_;
    static hsa_status_t   hsa_executable_freeze_interceptor(hsa_executable_t executable,
                                                            const char*      options);
    static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable);
    static hsa_status_t executable_symbols_cb(hsa_executable_t        exec,
                                              hsa_executable_symbol_t symbol, void* data);

    // HSA runtime API table
    static hsa_pfn_t hsa_api_;

    // AqlProfile API table
    aqlprofile_pfn_t aqlprofile_api_;

    // Loader API table
    hsa_ven_amd_loader_1_00_pfn_t loader_api_;

    // System timeout, ns
    static timestamp_t timeout_ns_;
    // System timeout, sysclock
    timestamp_t timeout_;

    // HSA timer
    HsaTimer* timer_;

    // Time shift array to support time conversion
    timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER];
    timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER];

    // CPU/kern-arg memory pools
    hsa_amd_memory_pool_t* cpu_pool_;
    hsa_amd_memory_pool_t* kern_arg_pool_;
};
} // namespace util
} // namespace rocprofiler
+3 -2
ファイルの表示
@@ -128,7 +128,8 @@ private:
static bool shutdown();
};
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
#if !defined(ROCPROFSYS_USE_ROCM) || ROCPROFSYS_USE_ROCM == 0
inline void
setup()
{}
@@ -154,7 +155,7 @@ inline void set_state(State) {}
} // namespace rocm_smi
} // namespace rocprofsys
#if defined(ROCPROFSYS_USE_ROCM_SMI) && ROCPROFSYS_USE_ROCM_SMI > 0
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
# if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \
(defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0)
ファイル差分が大きすぎるため省略します 差分を読み込み
@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@@ -22,67 +22,39 @@
#pragma once
#include "core/defines.hpp"
#include "core/timemory.hpp"
#include "library/components/rocprofiler.hpp"
#include <timemory/backends/hardware_counters.hpp>
#include <timemory/macros.hpp>
#include <timemory/mpl/concepts.hpp>
#include <timemory/mpl/macros.hpp>
#include <array>
#include <atomic>
#include <cstring>
#include <dlfcn.h>
#include <iostream>
#include <list>
#include <map>
#include <string>
#include <string_view>
#include <tuple>
#include <unistd.h>
#include <utility>
#include <variant>
#include <memory>
#include <vector>
namespace rocprofsys
{
namespace rocprofiler
namespace rocprofiler_sdk
{
std::map<uint32_t, std::vector<std::string_view>>
get_data_labels();
using hardware_counter_info = ::tim::hardware_counters::info;
void
rocm_initialize();
setup();
void
rocm_cleanup();
shutdown();
bool&
is_setup();
void
config();
void
post_process();
std::vector<component::rocm_info_entry>
rocm_metrics();
void
sample();
#if !defined(ROCPROFSYS_USE_ROCPROFILER) || ROCPROFSYS_USE_ROCPROFILER == 0
inline void
post_process()
{}
void
start();
inline void
rocm_cleanup()
{}
void
stop();
inline std::vector<component::rocm_info_entry>
rocm_metrics()
{
return std::vector<component::rocm_info_entry>{};
}
#endif
} // namespace rocprofiler
std::vector<hardware_counter_info>
get_rocm_events_info();
} // namespace rocprofiler_sdk
} // namespace rocprofsys
@@ -0,0 +1,9 @@
# Source and header files of the rocprofiler-sdk integration; both lists are added
# to the rocprofiler-systems object library as private sources.
set(rocprofiler_sdk_sources
    "${CMAKE_CURRENT_LIST_DIR}/counters.cpp"
    "${CMAKE_CURRENT_LIST_DIR}/fwd.cpp")

set(rocprofiler_sdk_headers
    "${CMAKE_CURRENT_LIST_DIR}/counters.hpp"
    "${CMAKE_CURRENT_LIST_DIR}/fwd.hpp")

target_sources(
    rocprofiler-systems-object-library
    PRIVATE ${rocprofiler_sdk_sources} ${rocprofiler_sdk_headers})
@@ -0,0 +1,135 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/rocprofiler-sdk/counters.hpp"
#include "common/synchronized.hpp"
#include "core/debug.hpp"
#include "core/timemory.hpp"
#include "library/rocprofiler-sdk/fwd.hpp"
#include <timemory/utility/types.hpp>
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/cxx/hash.hpp>
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include <rocprofiler-sdk/cxx/operators.hpp>
#include <rocprofiler-sdk/dispatch_counting_service.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <memory>
#include <unordered_map>
#include <vector>
namespace rocprofsys
{
namespace rocprofiler_sdk
{
namespace
{
// Looks through tool_data->events_info and returns the long description of the first
// entry whose symbol or short description starts with `_v`. Returns an empty string
// when no entry matches.
std::string
get_counter_description(const client_data* tool_data, std::string_view _v)
{
    for(const auto& _event : tool_data->events_info)
    {
        // find(...) == 0 means `_v` occurs at the very beginning of the text.
        const bool _symbol_matches = (_event.symbol().find(_v) == 0);
        if(_symbol_matches || _event.short_description().find(_v) == 0)
            return _event.long_description();
    }

    return {};
}
} // namespace
// Records one counter value for the dispatch referenced by `record`.
//
// tool_data  used to look up the kernel symbol of the dispatch.
// _track     optional Perfetto counter track; nothing is traced when it is null.
// _timing    interval of the dispatch; traced only when start > 0 and end > start.
// _scope     scope configuration passed to the counter bundle.
//
// The value is stored in a counter bundle named after the demangled kernel name and
// pushed/popped with the queue handle of the dispatch. When tracing applies, the
// value is emitted at `_timing.start` and 0 is emitted at `_timing.end`.
void
counter_event::operator()(const client_data* tool_data, ::perfetto::CounterTrack* _track,
                          timing_interval _timing, scope::config _scope) const
{
    if(!record.dispatch_data) return;

    const auto& _dispatch_info = record.dispatch_data->dispatch_info;
    const auto* _kern_sym_data =
        tool_data->get_kernel_symbol_info(_dispatch_info.kernel_id);

    // Without symbol data there is no kernel name to attribute the value to, and
    // dereferencing the pointer would be undefined behavior.
    // NOTE(review): assumes get_kernel_symbol_info() returns nullptr for an unknown
    // kernel id -- confirm against its definition.
    if(!_kern_sym_data) return;

    auto _bundle = counter_bundle_t{ tim::demangle(_kern_sym_data->kernel_name), _scope };

    _bundle.push(_dispatch_info.queue_id.handle)
        .start()
        .store(record.record_counter.counter_value);
    _bundle.stop().pop(_dispatch_info.queue_id.handle);

    if(_track && _timing.start > 0 && _timing.end > _timing.start)
    {
        TRACE_COUNTER(trait::name<category::rocm_counter_collection>::value, *_track,
                      _timing.start, record.record_counter.counter_value);
        TRACE_COUNTER(trait::name<category::rocm_counter_collection>::value, *_track,
                      _timing.end, 0);
    }
}
// Creates the storage and the Perfetto counter track for one metric on one device.
//
// _tool_data  tool data used to look up the metric description.
// _devid      device id; becomes part of the storage name and the track name.
// _idx        index stored in `index` and passed on to the storage.
// _name       metric name; a trailing "[N]" suffix is rewritten to "_N" for the
//             storage and track names, while `metric_name` keeps the original text.
counter_storage::counter_storage(const client_data* _tool_data, uint64_t _devid,
                                 size_t _idx, std::string_view _name)
: tool_data{ _tool_data }
, device_id{ _devid }
, index{ static_cast<int64_t>(_idx) }
, metric_name{ _name }
, metric_description{ get_counter_description(_tool_data, metric_name) }
{
    // Compiled once and reused by every counter_storage: building a std::regex is
    // comparatively expensive and the pattern never changes.
    static const auto _indexed_name = std::regex{ "(.*)\\[([0-9]+)\\]" };

    const auto _metric_name =
        std::regex_replace(std::string{ _name }, _indexed_name, "$1_$2");

    storage_name = JOIN('-', "rocprof", "device", device_id, _metric_name);
    storage      = std::make_unique<counter_storage_type>(tim::standalone_storage{}, index,
                                                          storage_name);

    constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_COUNT;

    track_name = JOIN(" ", "GPU", _metric_name, JOIN("", '[', device_id, ']'));
    track      = std::make_unique<counter_track_type>(
        ::perfetto::StaticString(track_name.c_str()));
    track->set_is_incremental(false);
    track->set_unit(_unit);
    track->set_unit_multiplier(1);
}
// Forwards `_event` to this storage: passes storage.get() to
// operation::set_storage<counter_data_tracker> (presumably making it the storage the
// tracker records into -- TODO confirm) and then invokes the event with this object's
// tool data and track together with the given timing and scope.
void
counter_storage::operator()(const counter_event& _event, timing_interval _timing,
scope::config _scope) const
{
operation::set_storage<counter_data_tracker>{}(storage.get());
_event(tool_data, track.get(), _timing, _scope);
}
// Writes out the storage of this metric. Before calling storage->write() the
// tracker's label and description are set to this metric's name and description.
void
counter_storage::write() const
{
operation::set_storage<counter_data_tracker>{}(storage.get());
// label()/description() are assigned through the references they return
counter_data_tracker::label() = metric_name;
counter_data_tracker::description() = metric_description;
storage->write();
}
} // namespace rocprofiler_sdk
} // namespace rocprofsys
@@ -0,0 +1,168 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "common/synchronized.hpp"
#include "core/debug.hpp"
#include "core/perfetto.hpp"
#include "core/timemory.hpp"
#include "library/rocprofiler-sdk/fwd.hpp"
#include <timemory/utility/types.hpp>
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/cxx/hash.hpp>
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include <rocprofiler-sdk/cxx/operators.hpp>
#include <rocprofiler-sdk/dispatch_counting_service.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <memory>
#include <unordered_map>
#include <vector>
namespace rocprofsys
{
namespace rocprofiler_sdk
{
// Bundle of the pieces describing one counter value of one kernel dispatch.
struct counter_dispatch_record
{
    // dispatch data supplied by the dispatch counting service (not owned)
    const rocprofiler_dispatch_counting_service_data_t* dispatch_data = nullptr;
    // identifier of the dispatch
    rocprofiler_dispatch_id_t dispatch_id = 0;
    // identifier of the counter
    rocprofiler_counter_id_t counter_id = {};
    // the counter record itself (its counter_value is what gets stored/traced)
    rocprofiler_record_counter_t record_counter = {};
};
// Tag type that makes counter_data_tracker a distinct data_tracker instantiation.
struct counter_data_tag
{};
// timemory data tracker holding counter values as double
using counter_data_tracker = component::data_tracker<double, counter_data_tag>;
// storage type associated with counter_data_tracker
using counter_storage_type = typename counter_data_tracker::storage_type;
// lightweight timemory bundle containing only the counter tracker
using counter_bundle_t = tim::lightweight_tuple<counter_data_tracker>;
// perfetto track type used for counter values
using counter_track_type = ::perfetto::CounterTrack;
// A single counter sample, invocable to record itself.
struct counter_event
{
    ROCPROFSYS_DEFAULT_OBJECT(counter_event)
    // Takes an rvalue record; note that the named parameter is an lvalue inside
    // the constructor, so `record` is copy-initialized from it.
    explicit counter_event(counter_dispatch_record&& _v)
    : record{ _v }
    {}
    // Records this event using the given tool data, perfetto counter track,
    // timing interval and scope (defined in the corresponding source file).
    void operator()(const client_data* tool_data, counter_track_type*,
                    timing_interval _timing, scope::config _scope) const;
    // the dispatch/counter record this event carries
    counter_dispatch_record record = {};
};
// Owns the timemory storage and perfetto counter track for one metric on one
// device. Move-only.
struct counter_storage
{
    // client data passed to events when they are recorded (not owned)
    const client_data* tool_data = nullptr;
    // device identifier used in storage_name and track_name
    uint64_t device_id = 0;
    // index passed to the storage on construction
    int64_t index = 0;
    // metric name as given to the constructor
    std::string metric_name = {};
    // description looked up from the tool data for metric_name
    std::string metric_description = {};
    // name given to the timemory storage
    std::string storage_name = {};
    // name given to the perfetto track; the track is constructed from its c_str()
    std::string track_name = {};
    std::unique_ptr<counter_storage_type> storage = {};
    std::unique_ptr<counter_track_type> track = {};
    counter_storage(const client_data* _tool_data, uint64_t _devid, size_t _idx,
                    std::string_view _name);
    ~counter_storage() = default;
    counter_storage(const counter_storage&) = delete;
    counter_storage(counter_storage&&) = default;
    counter_storage& operator=(const counter_storage&) = delete;
    counter_storage& operator=(counter_storage&&) = default;
    // Lexicographic ordering on (storage_name, device_id, index).
    friend bool operator<(const counter_storage& lhs, const counter_storage& rhs)
    {
        return std::tie(lhs.storage_name, lhs.device_id, lhs.index) <
               std::tie(rhs.storage_name, rhs.device_id, rhs.index);
    }
    // Records an event into this storage/track.
    void operator()(const counter_event& _event, timing_interval _timing,
                    scope::config _scope = scope::get_default()) const;
    // Writes the storage, labelled with metric_name / metric_description.
    void write() const;
};
} // namespace rocprofiler_sdk
} // namespace rocprofsys
namespace tim
{
namespace operation
{
// Specialization that keeps the storage pointers for counter_data_tracker in a
// fixed-size, function-local static array of max_threads slots.
template <>
struct set_storage<::rocprofsys::rocprofiler_sdk::counter_data_tracker>
{
    // number of slots in the storage pointer array
    static constexpr size_t max_threads = 4096;
    using type = ::rocprofsys::rocprofiler_sdk::counter_data_tracker;
    using storage_array_t = std::array<storage<type>*, max_threads>;
    // get_storage reads the same array through the private get()
    friend struct get_storage<rocprofsys::rocprofiler_sdk::counter_data_tracker>;
    ROCPROFSYS_DEFAULT_OBJECT(set_storage)
    // Sets slot _idx; std::array::at throws std::out_of_range if _idx >= max_threads.
    auto operator()(storage<type>* _v, size_t _idx) const { get().at(_idx) = _v; }
    // Intentionally a no-op.
    auto operator()(type&, size_t) const {}
    // Sets every slot to _v.
    auto operator()(storage<type>* _v) const { get().fill(_v); }
private:
    // Returns the shared slot array (all entries start out null).
    static storage_array_t& get()
    {
        static storage_array_t _v = { nullptr };
        return _v;
    }
};
// Specialization that reads the storage pointers maintained by the matching
// set_storage specialization.
template <>
struct get_storage<::rocprofsys::rocprofiler_sdk::counter_data_tracker>
{
    using type = ::rocprofsys::rocprofiler_sdk::counter_data_tracker;
    ROCPROFSYS_DEFAULT_OBJECT(get_storage)
    // Returns the pointer in slot 0; the argument is not inspected.
    auto operator()(const type&) const
    {
        return operation::set_storage<type>::get().at(0);
    }
    // Same as above, using a default-constructed tracker as the argument.
    auto operator()() const
    {
        type _obj{};
        return (*this)(_obj);
    }
    // Returns the pointer in slot _idx; throws std::out_of_range if _idx is
    // not a valid slot.
    auto operator()(size_t _idx) const
    {
        return operation::set_storage<type>::get().at(_idx);
    }
    // Same as the index overload; the tracker argument is not inspected.
    auto operator()(type&, size_t _idx) const { return (*this)(_idx); }
};
} // namespace operation
} // namespace tim
+270
ファイルの表示
@@ -0,0 +1,270 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/rocprofiler-sdk/fwd.hpp"
#include "core/debug.hpp"
#include "core/state.hpp"
#include <timemory/utility/join.hpp>
#include <exception>
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <algorithm>
#include <utility>
namespace rocprofsys
{
namespace rocprofiler_sdk
{
namespace
{
using tool_agent_vec_t = std::vector<tool_agent>;
// Callback for rocprofiler_iterate_counter_dimensions: appends the reported
// dimension records to the std::vector passed through user_data.
rocprofiler_status_t
dimensions_info_callback(rocprofiler_counter_id_t /*id*/,
                         const rocprofiler_record_dimension_info_t* dim_info,
                         long unsigned int num_dims, void* user_data)
{
    using dimension_vec_t = std::vector<rocprofiler_record_dimension_info_t>;

    auto& _dims = *static_cast<dimension_vec_t*>(user_data);
    _dims.reserve(num_dims);
    // copy all num_dims records in one shot
    _dims.insert(_dims.end(), dim_info, dim_info + num_dims);
    return ROCPROFILER_STATUS_SUCCESS;
}
// Callback for rocprofiler_iterate_agent_supported_counters: stores the info
// and dimension records of every non-constant counter of `agent_id` in the
// agent_counter_info_map_t passed through user_data. An entry for the agent is
// created even when no counter ends up being stored.
rocprofiler_status_t
counters_supported_callback(rocprofiler_agent_id_t    agent_id,
                            rocprofiler_counter_id_t* counters, size_t num_counters,
                            void* user_data)
{
    using value_type = typename agent_counter_info_map_t::mapped_type;

    auto* _counter_map = static_cast<agent_counter_info_map_t*>(user_data);
    // emplace leaves an already-existing entry untouched and hands it back
    auto& _agent_counters = _counter_map->emplace(agent_id, value_type{}).first->second;

    for(size_t i = 0; i < num_counters; ++i)
    {
        auto _info       = rocprofiler_counter_info_v0_t{};
        auto _dimensions = std::vector<rocprofiler_record_dimension_info_t>{};

        ROCPROFILER_CALL(rocprofiler_query_counter_info(
            counters[i], ROCPROFILER_COUNTER_INFO_VERSION_0, &_info));
        // fill the local dimension vector for this counter
        ROCPROFILER_CALL(rocprofiler_iterate_counter_dimensions(
            counters[i], dimensions_info_callback, &_dimensions));

        // constant counters are not recorded
        if(_info.is_constant) continue;

        _agent_counters.emplace_back(agent_id, _info, std::move(_dimensions));
    }
    return ROCPROFILER_STATUS_SUCCESS;
}
// Queries the supported counters of every agent in `_agents` and returns them
// keyed by agent id. Per agent, counters are sorted by counter id handle and
// each counter's dimension records are sorted by dimension id.
//
// The result always contains one entry per (non-null) agent: when the
// rocprofiler query fails -- ROCPROFILER_CALL only emits a warning -- the
// callback never inserts the key, so the entry is created empty here instead of
// letting a lookup throw std::out_of_range.
agent_counter_info_map_t
get_agent_counter_info(const tool_agent_vec_t& _agents)
{
    auto _data = agent_counter_info_map_t{};
    for(const auto& itr : _agents)
    {
        // nothing can be queried without the underlying agent record
        if(itr.agent == nullptr) continue;

        const auto _agent_id = itr.agent->id;
        ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
            _agent_id, counters_supported_callback, &_data));

        // operator[] default-constructs the entry if the callback did not run
        auto& _counters = _data[_agent_id];

        std::sort(_counters.begin(), _counters.end(),
                  [](const auto& lhs, const auto& rhs) {
                      return (lhs.id.handle < rhs.id.handle);
                  });
        for(auto& citr : _counters)
        {
            std::sort(citr.dimension_info.begin(), citr.dimension_info.end(),
                      [](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); });
        }
    }
    return _data;
}
} // namespace
// Builds a tool counter record from the SDK counter info (copied into the base
// class), the id of the agent it belongs to, and its dimension records (moved in).
rocprofiler_tool_counter_info_t::rocprofiler_tool_counter_info_t(
    rocprofiler_agent_id_t _agent_id, parent_type _info, dimension_info_vec_t&& _dim_info)
: parent_type{ _info }
, agent_id{ _agent_id }
, dimension_info{ std::move(_dim_info) }
{}
void
client_data::initialize()
{
buffered_tracing_info = rocprofiler::sdk::get_buffer_tracing_names();
callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();
static constexpr auto supported_agent_info_version = ROCPROFILER_AGENT_INFO_VERSION_0;
rocprofiler_query_available_agents_cb_t iterate_cb =
[](rocprofiler_agent_version_t version, const void** agents_arr,
size_t num_agents, void* user_data) {
ROCPROFSYS_CONDITIONAL_ABORT(version != supported_agent_info_version,
"rocprofiler agent info version != expected "
"agent info version (=%i). value: %i\n",
supported_agent_info_version, version);
auto _agents_v = std::vector<rocprofiler_agent_v0_t>{};
for(size_t i = 0; i < num_agents; ++i)
{
const auto* _agent =
static_cast<const rocprofiler_agent_v0_t*>(agents_arr[i]);
_agents_v.emplace_back(*_agent);
}
auto* tool_data_v = as_client_data(user_data);
tool_data_v->set_agents(std::move(_agents_v));
return ROCPROFILER_STATUS_SUCCESS;
};
ROCPROFILER_CALL(rocprofiler_query_available_agents(
supported_agent_info_version, iterate_cb, sizeof(rocprofiler_agent_t), this));
}
void
client_data::initialize_event_info()
{
if(agents.empty()) initialize();
if(agent_counter_info.size() != gpu_agents.size())
agent_counter_info = get_agent_counter_info(gpu_agents);
try
{
using qualifier_t = tim::hardware_counters::qualifier;
using qualifier_vec_t = std::vector<qualifier_t>;
for(const auto& aitr : gpu_agents)
{
auto _dev_index = aitr.device_id;
auto _device_qualifier_sym = JOIN("", ":device=", _dev_index);
auto _device_qualifier =
tim::hardware_counters::qualifier{ true, static_cast<int>(_dev_index),
_device_qualifier_sym,
JOIN(" ", "Device", _dev_index) };
auto _counter_info = agent_counter_info.at(aitr.agent->id);
std::sort(_counter_info.begin(), _counter_info.end(),
[](const rocprofiler_tool_counter_info_t& lhs,
const rocprofiler_tool_counter_info_t& rhs) {
if(lhs.is_constant && rhs.is_constant)
return lhs.id < rhs.id;
else if(lhs.is_constant)
return true;
else if(rhs.is_constant)
return false;
if(!lhs.is_derived && !rhs.is_derived)
return lhs.id < rhs.id;
else if(!lhs.is_derived)
return true;
else if(!rhs.is_derived)
return false;
return lhs.id < rhs.id;
});
for(const auto& ditr : _counter_info)
{
auto _long_desc = std::string{ ditr.description };
auto _units = std::string{};
auto _pysym = std::string{};
if(ditr.is_constant)
{
continue;
}
else if(ditr.is_derived)
{
auto _sym = JOIN("", ditr.name, _device_qualifier_sym);
auto _short_desc = JOIN("", "Derived counter: ", ditr.expression);
events_info.emplace_back(hardware_counter_info(
true, tim::hardware_counters::api::rocm, events_info.size(), 0,
_sym, _pysym, _short_desc, _long_desc, _units,
qualifier_vec_t{ _device_qualifier }));
}
else
{
auto _dim_info = std::vector<std::string>{};
for(const auto& itr : ditr.dimension_info)
{
auto _info = (itr.instance_size > 1)
? JOIN("", itr.name, "[", 0, ":",
itr.instance_size - 1, "]")
: std::string{};
if(!_info.empty()) _dim_info.emplace_back(_info);
}
auto _sym = JOIN("", ditr.name, _device_qualifier_sym);
auto _short_desc = JOIN("", ditr.name, " on device ", _dev_index);
if(!_dim_info.empty())
{
namespace join = ::timemory::join;
_short_desc += JOIN(
"", ". ",
join::join(join::array_config{ ", ", "", "" }, _dim_info));
}
events_info.emplace_back(hardware_counter_info(
true, tim::hardware_counters::api::rocm, events_info.size(), 0,
_sym, _pysym, _short_desc, _long_desc, _units,
qualifier_vec_t{ _device_qualifier }));
}
}
}
} catch(std::exception& _e)
{
ROCPROFSYS_WARNING_F(1, "Constructing ROCm event info failed: %s\n", _e.what());
}
}
// Takes ownership of the agent records, orders them by node id, and rebuilds
// the CPU and GPU agent lists. Each tool_agent gets a device id equal to its
// position within its own list and points at the record stored in `agents`.
void
client_data::set_agents(agent_vec_t&& _agents_v)
{
    agents = std::move(_agents_v);
    std::sort(agents.begin(), agents.end(),
              [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; });

    cpu_agents.clear();
    gpu_agents.clear();

    for(const auto& itr : agents)
    {
        switch(itr.type)
        {
            case ROCPROFILER_AGENT_TYPE_CPU:
                cpu_agents.emplace_back(tool_agent{ cpu_agents.size(), &itr });
                break;
            case ROCPROFILER_AGENT_TYPE_GPU:
                gpu_agents.emplace_back(tool_agent{ gpu_agents.size(), &itr });
                break;
            default:
                // other agent types are not tracked
                break;
        }
    }
}
} // namespace rocprofiler_sdk
} // namespace rocprofsys
+252
ファイルの表示
@@ -0,0 +1,252 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "common/synchronized.hpp"
#include "core/timemory.hpp"
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/cxx/hash.hpp>
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include <rocprofiler-sdk/cxx/operators.hpp>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <memory>
#include <vector>
namespace rocprofsys
{
namespace rocprofiler_sdk
{
// timemory description of one hardware counter event
using hardware_counter_info = ::tim::hardware_counters::info;
// payload delivered when a kernel symbol is registered
using kernel_symbol_data_t =
    rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
// kernel id -> kernel symbol payload
using kernel_symbol_map_t =
    std::unordered_map<rocprofiler_kernel_id_t, kernel_symbol_data_t>;
// list of (string, string) pairs describing callback arguments
using callback_arg_array_t = std::vector<std::pair<std::string, std::string>>;
// Timestamped copy of a code-object load callback: the tracing record plus its
// load payload.
struct code_object_callback_record_t
{
    uint64_t timestamp = 0;
    rocprofiler_callback_tracing_record_t record = {};
    rocprofiler_callback_tracing_code_object_load_data_t payload = {};
};
// Timestamped copy of a kernel-symbol registration callback: the tracing record
// plus its kernel symbol payload.
struct kernel_symbol_callback_record_t
{
    uint64_t timestamp = 0;
    rocprofiler_callback_tracing_record_t record = {};
    kernel_symbol_data_t payload = {};
};
// SDK counter info (inherited) extended with the owning agent's id and the
// counter's dimension records.
struct rocprofiler_tool_counter_info_t : rocprofiler_counter_info_v0_t
{
    using this_type = rocprofiler_tool_counter_info_t;
    using parent_type = rocprofiler_counter_info_v0_t;
    using dimension_info_vec_t = std::vector<rocprofiler_record_dimension_info_t>;
    // Copies `_info` into the base, stores `_agent_id`, moves `_dim_info` in.
    rocprofiler_tool_counter_info_t(rocprofiler_agent_id_t _agent_id, parent_type _info,
                                    dimension_info_vec_t&& _dim_info);
    rocprofiler_tool_counter_info_t() = default;
    ~rocprofiler_tool_counter_info_t() = default;
    rocprofiler_tool_counter_info_t(const rocprofiler_tool_counter_info_t&) = default;
    rocprofiler_tool_counter_info_t(rocprofiler_tool_counter_info_t&&) noexcept = default;
    rocprofiler_tool_counter_info_t& operator=(const rocprofiler_tool_counter_info_t&) =
        default;
    rocprofiler_tool_counter_info_t& operator =(
        rocprofiler_tool_counter_info_t&&) noexcept = default;
    // agent this counter was reported for
    rocprofiler_agent_id_t agent_id = {};
    // dimension records of this counter
    std::vector<rocprofiler_record_dimension_info_t> dimension_info = {};
};
// Pairs an agent record with a tool-assigned device id.
struct tool_agent
{
    // position of the agent within its CPU or GPU list (see client_data::set_agents)
    uint64_t device_id = 0;
    // non-owning pointer to the agent record
    const rocprofiler_agent_v0_t* agent = nullptr;
};
// Start/end timestamp pair.
struct timing_interval
{
    rocprofiler_timestamp_t start = 0;
    rocprofiler_timestamp_t end = 0;
};
// agent id -> counter info records of that agent
using agent_counter_info_map_t =
    std::unordered_map<rocprofiler_agent_id_t,
                       std::vector<rocprofiler_tool_counter_info_t>>;
// agent id -> optional profile config id
using agent_counter_profile_map_t =
    std::unordered_map<rocprofiler_agent_id_t,
                       std::optional<rocprofiler_profile_config_id_t>>;
// list of counter ids
using counter_id_vec_t = std::vector<rocprofiler_counter_id_t>;
// agent id -> counter ids
using agent_counter_id_map_t =
    std::unordered_map<rocprofiler_agent_id_t, counter_id_vec_t>;
// callback tracing kind -> set of tracing operations
using backtrace_operation_map_t =
    std::unordered_map<rocprofiler_callback_tracing_kind_t,
                       std::unordered_set<rocprofiler_tracing_operation_t>>;
// State shared by the rocprofiler-sdk tool: contexts, buffers, agents, counter
// information and recorded code-object / kernel-symbol callbacks.
struct client_data
{
    // number of buffer ids returned by get_buffers()
    static constexpr size_t num_buffers = 3;
    // number of context ids returned by get_contexts()
    static constexpr size_t num_contexts = 2;
    using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t<std::string_view>;
    using callback_name_info_t = rocprofiler::sdk::callback_name_info_t<std::string_view>;
    using kernel_symbol_vec_t = std::vector<kernel_symbol_callback_record_t*>;
    using code_object_vec_t = std::vector<code_object_callback_record_t>;
    using buffer_id_vec_t = std::array<rocprofiler_buffer_id_t, num_buffers>;
    using context_id_vec_t = std::array<rocprofiler_context_id_t, num_contexts>;
    using agent_vec_t = std::vector<rocprofiler_agent_v0_t>;
    rocprofiler_client_id_t* client_id = nullptr;
    rocprofiler_client_finalize_t client_fini = nullptr;
    rocprofiler_context_id_t primary_ctx = { 0 };
    rocprofiler_context_id_t counter_ctx = { 0 };
    rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 };
    rocprofiler_buffer_id_t memory_copy_buffer = { 0 };
    rocprofiler_buffer_id_t counter_collection_buffer = { 0 };
    // all agent records, kept sorted by node id by set_agents()
    std::vector<rocprofiler_agent_v0_t> agents = {};
    // CPU / GPU subsets; their `agent` pointers refer to elements of `agents`,
    // so `agents` must not be modified without rebuilding them via set_agents()
    std::vector<tool_agent> cpu_agents = {};
    std::vector<tool_agent> gpu_agents = {};
    // event descriptions appended by initialize_event_info()
    std::vector<hardware_counter_info> events_info = {};
    agent_counter_id_map_t agent_events = {};
    // per-agent counter info, filled by initialize_event_info()
    agent_counter_info_map_t agent_counter_info = {};
    agent_counter_profile_map_t agent_counter_profiles = {};
    common::synchronized<code_object_vec_t> code_object_records = {};
    common::synchronized<kernel_symbol_vec_t> kernel_symbol_records = {};
    // name tables loaded by initialize()
    buffer_name_info_t buffered_tracing_info = {};
    callback_name_info_t callback_tracing_info = {};
    backtrace_operation_map_t backtrace_operations = {};
    // Loads tracing name tables and queries the available agents.
    void initialize();
    // Builds events_info from the GPU agents' counters.
    void initialize_event_info();
    // Replaces the agent list and rebuilds cpu_agents / gpu_agents.
    void set_agents(agent_vec_t&& agents);
    // Returns { primary_ctx, counter_ctx }.
    context_id_vec_t get_contexts() const;
    // Returns { kernel_dispatch_buffer, memory_copy_buffer, counter_collection_buffer }.
    buffer_id_vec_t get_buffers() const;
    // Lookup helpers below return nullptr when nothing matches.
    const rocprofiler_agent_t* get_agent(rocprofiler_agent_id_t _id) const;
    const tool_agent* get_gpu_tool_agent(rocprofiler_agent_id_t id) const;
    const kernel_symbol_data_t* get_kernel_symbol_info(uint64_t _kernel_id) const;
    const rocprofiler_tool_counter_info_t* get_tool_counter_info(
        rocprofiler_agent_id_t _agent_id, rocprofiler_counter_id_t _counter_id) const;
};
// Returns the tool's context ids: primary context first, counter context second.
inline client_data::context_id_vec_t
client_data::get_contexts() const
{
    context_id_vec_t _contexts = { primary_ctx, counter_ctx };
    return _contexts;
}
// Returns the tool's buffer ids in the order: kernel dispatch, memory copy,
// counter collection.
inline client_data::buffer_id_vec_t
client_data::get_buffers() const
{
    buffer_id_vec_t _buffers = { kernel_dispatch_buffer, memory_copy_buffer,
                                 counter_collection_buffer };
    return _buffers;
}
// Returns the agent record whose id equals `_id`, or nullptr when there is none.
inline const rocprofiler_agent_t*
client_data::get_agent(rocprofiler_agent_id_t _id) const
{
    for(auto itr = agents.begin(); itr != agents.end(); ++itr)
    {
        if(itr->id == _id) return &(*itr);
    }
    return nullptr;
}
// Returns the GPU tool agent whose underlying agent id equals `id`, or nullptr
// when there is none.
inline const tool_agent*
client_data::get_gpu_tool_agent(rocprofiler_agent_id_t id) const
{
    for(auto itr = gpu_agents.begin(); itr != gpu_agents.end(); ++itr)
    {
        if(id == itr->agent->id) return &(*itr);
    }
    return nullptr;
}
// Returns the kernel symbol payload recorded for `_kernel_id`, or nullptr when
// no recorded kernel symbol has that id. The records are inspected through the
// container's rlock() accessor.
inline const kernel_symbol_data_t*
client_data::get_kernel_symbol_info(uint64_t _kernel_id) const
{
    return kernel_symbol_records.rlock(
        [_kernel_id](const auto& _data) -> const kernel_symbol_data_t* {
            for(const auto& itr : _data)
            {
                // records are stored by pointer; skip any empty slot
                if(itr != nullptr && _kernel_id == itr->payload.kernel_id)
                    return &itr->payload;
            }
            return nullptr;
        });
}
// Returns the counter info recorded for `_counter_id` on agent `_agent_id`.
// Returns nullptr when the agent has no counter info entry or the counter is
// not among the agent's counters (an unknown agent no longer raises
// std::out_of_range, matching the other lookup helpers of this class).
inline const rocprofiler_tool_counter_info_t*
client_data::get_tool_counter_info(rocprofiler_agent_id_t   _agent_id,
                                   rocprofiler_counter_id_t _counter_id) const
{
    auto _agent_entry = agent_counter_info.find(_agent_id);
    if(_agent_entry == agent_counter_info.end()) return nullptr;

    for(const auto& itr : _agent_entry->second)
    {
        if(itr.id == _counter_id) return &itr;
    }
    return nullptr;
}
// Converts an opaque user-data pointer back to the client_data it was created
// from; the caller is responsible for the pointer really being a client_data.
inline constexpr client_data*
as_client_data(void* _ptr)
{
    return static_cast<client_data*>(_ptr);
}
} // namespace rocprofiler_sdk
} // namespace rocprofsys
// ROCPROFILER_CALL(expr): evaluates `expr` (a rocprofiler-sdk call returning
// rocprofiler_status_t) exactly once and, if the status is not
// ROCPROFILER_STATUS_SUCCESS, emits a warning containing the call text, file,
// line, status code and status string. Execution continues after a failure.
//
// The body is wrapped in do { ... } while(0) so the macro expands to a single
// statement and is safe in unbraced if/else; invocations must end with ';'.
#if !defined(ROCPROFILER_CALL)
#    define ROCPROFILER_CALL(result) \
        do \
        { \
            rocprofiler_status_t ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) = \
                (result); \
            if(ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) != \
               ROCPROFILER_STATUS_SUCCESS) \
            { \
                auto msg = std::stringstream{}; \
                std::string status_msg = rocprofiler_get_status_string( \
                    ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__)); \
                msg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " \
                    << "rocprofiler-sdk call [" << #result \
                    << "] failed with error code " \
                    << ROCPROFSYS_VARIABLE(_rocp_status_, __LINE__) \
                    << " :: " << status_msg; \
                ROCPROFSYS_WARNING(0, "%s\n", msg.str().c_str()); \
            } \
        } while(0)
#endif
-834
ファイルの表示
@@ -1,834 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/rocprofiler.hpp"
#include "core/common.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/gpu.hpp"
#include "core/perfetto.hpp"
#include "library/rocm.hpp"
#include "library/rocm/hsa_rsrc_factory.hpp"
#include <timemory/backends/hardware_counters.hpp>
#include <timemory/manager.hpp>
#include <timemory/mpl/concepts.hpp>
#include <timemory/storage/types.hpp>
#include <timemory/utility/types.hpp>
#include <rocprofiler.h>
#include <atomic>
#include <cstdlib>
#include <dlfcn.h>
#include <hsa.h>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string.h>
#include <string_view>
#include <type_traits>
#include <unistd.h>
#include <vector>
namespace rocprofsys
{
namespace rocprofiler
{
namespace
{
using ::rocprofiler::util::AgentInfo;
using ::rocprofiler::util::HsaRsrcFactory;
// Returns the process-wide map from a uint32_t key (used with the GPU index in
// this file) to the rocprofiler features registered under that key.
auto&
get_event_names()
{
    static auto _v = std::map<uint32_t, std::vector<rocprofiler_feature_t>>{};
    return _v;
}
}  // namespace
// Fatal error handler: prints a blank line followed by `msg`, then aborts the
// process. Never returns.
void
fatal(const std::string& msg)
{
    ROCPROFSYS_PRINT_F("\n");
    ROCPROFSYS_PRINT_F("%s\n", msg.c_str());
    abort();
}
// Returns the rocprofiler error string for a failed HSA status.
// For HSA_STATUS_SUCCESS no lookup is made and nullptr is returned. The result
// is whatever rocprofiler_error_string() stored, so callers should be prepared
// for nullptr (NOTE(review): confirm whether it can leave the pointer unset).
const char*
rocm_error_string(hsa_status_t _status)
{
    const char* _err_string = nullptr;
    if(_status != HSA_STATUS_SUCCESS) rocprofiler_error_string(&_err_string);
    return _err_string;
}
// Checks a returned HSA API status.
// Returns true on HSA_STATUS_SUCCESS. Any other status listed in `_nonfatal`
// prints a warning and returns false; any status not listed there goes through
// fatal(), which aborts.
bool
rocm_check_status(hsa_status_t _status, const std::set<hsa_status_t>& _nonfatal = {})
{
    if(_status == HSA_STATUS_SUCCESS) return true;

    const bool _is_fatal = (_nonfatal.count(_status) == 0);
    if(_is_fatal) fatal(JOIN(" :: ", "ERROR", rocm_error_string(_status)));

    ROCPROFSYS_PRINT_F("Warning! %s\n", rocm_error_string(_status));
    return false;
}
// Payload stored in each profiling pool entry.
struct context_entry_t
{
    // set (through an atomic store) by the dispatch callback once the other
    // fields are filled; the dump routine spins until it reads true
    bool valid;
    // agent the kernel was dispatched on
    hsa_agent_t agent;
    // profiling group fetched for the dispatch
    rocprofiler_group_t group;
    // copy of the callback data; kernel_name points at a strdup'd copy
    rocprofiler_callback_data_t data;
};
// Argument passed to the dispatch callback.
struct callbacks_arg_t
{
    // array of profiling pools, indexed by GPU device index
    rocprofiler_pool_t** pools;
};
// Argument passed to the pool completion handler.
struct handler_arg_t
{
    // feature array forwarded to the dump routine
    rocprofiler_feature_t* features;
    // number of entries in `features`
    unsigned feature_count;
};
// Returns a mutable reference to a process-wide flag (initially false).
// NOTE(review): presumably tracks whether rocprofiler setup has run; the
// writers are not visible here.
bool&
is_setup()
{
    static bool _v = false;
    return _v;
}
// Returns, for every key in the registered event-name map, the names of its
// features as string views. The views refer to the name strings held by the
// registered features, not to copies.
std::map<uint32_t, std::vector<std::string_view>>
get_data_labels()
{
    auto _labels = std::map<uint32_t, std::vector<std::string_view>>{};
    for(const auto& [_key, _features] : get_event_names())
    {
        // creates the (empty) entry even when there are no features
        auto& _names = _labels[_key];
        for(const auto& fitr : _features)
            _names.emplace_back(fitr.name);
    }
    return _labels;
}
// Dump stored context entry.
//
// Waits until the dispatch callback has marked the entry valid, then converts
// it into a component::rocm_event appended to component::rocm_data().
//
//  entry         : pool payload filled by rocm_dispatch_callback
//  features      : feature array associated with the pool
//  feature_count : number of entries in `features`; when non-zero the group's
//                  data and metrics are fetched before the event is built
//
// The kernel name stored in the entry is a strdup'd copy made by
// rocm_dispatch_callback; it is released here so it does not leak once per
// dispatch.
void
rocm_dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features,
                        unsigned feature_count)
{
    // spin until the dispatch callback publishes the entry
    volatile std::atomic<bool>* valid =
        reinterpret_cast<std::atomic<bool>*>(&entry->valid);
    while(valid->load() == false)
        sched_yield();

    // take ownership of the heap-allocated kernel name and clear the pointer so
    // the entry never refers to freed memory
    char* _name_copy        = const_cast<char*>(entry->data.kernel_name);
    entry->data.kernel_name = nullptr;
    auto _kernel_name       = std::string{ (_name_copy != nullptr) ? _name_copy : "" };
    ::free(_name_copy);

    const rocprofiler_dispatch_record_t* record = entry->data.record;
    if(!record) return;  // there is nothing to do here.

    auto _queue_id  = entry->data.queue_id;
    auto _thread_id = entry->data.thread_id;
    auto _dev_id    = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index;

    // drop anything following the closing parenthesis of the signature
    auto _pos = _kernel_name.find_last_of(')');
    if(_pos != std::string::npos) _kernel_name = _kernel_name.substr(0, _pos + 1);

    rocprofiler_group_t& group = entry->group;
    if(group.context == nullptr)
    {
        fatal("context is nullptr\n");
    }
    if(feature_count > 0)
    {
        rocm_check_status(rocprofiler_group_get_data(&group));
        rocm_check_status(rocprofiler_get_metrics(group.context));
    }

    auto _evt =
        component::rocm_event{ _dev_id,       _thread_id,  _queue_id,     _kernel_name,
                               record->begin, record->end, feature_count, features };
    component::rocm_data()->emplace_back(_evt);
}
// Profiling completion handler for a pool entry.
// Interprets the entry payload as a context_entry_t and `arg` as a
// handler_arg_t, dumps the context entry, and always returns true.
// No locking is performed here (a previously sketched rocm_mutex lock is left
// disabled).
bool
rocm_context_handler(const rocprofiler_pool_entry_t* entry, void* arg)
{
    auto* _context_entry = reinterpret_cast<context_entry_t*>(entry->payload);
    auto* _handler_arg   = reinterpret_cast<handler_arg_t*>(arg);

    rocm_dump_context_entry(_context_entry, _handler_arg->features,
                            _handler_arg->feature_count);
    return true;
}
// Kernel dispatch callback.
// Fetches a pool entry for the GPU the kernel is dispatched on, stores the
// agent, group 0 of the entry's context and a copy of the callback data in the
// entry payload, then marks the payload valid. Returns HSA_STATUS_SUCCESS;
// failing rocprofiler calls are handled by rocm_check_status.
hsa_status_t
rocm_dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg,
                       rocprofiler_group_t* group)
{
    // Passed tool data
    hsa_agent_t agent = callback_data->agent;
    // Open profiling context
    const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index;
    callbacks_arg_t* callbacks_arg = reinterpret_cast<callbacks_arg_t*>(arg);
    rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id];
    rocprofiler_pool_entry_t pool_entry{};
    rocm_check_status(rocprofiler_pool_fetch(pool, &pool_entry));
    // Profiling context entry
    rocprofiler_t* context = pool_entry.context;
    context_entry_t* entry = reinterpret_cast<context_entry_t*>(pool_entry.payload);
    // Get group[0]
    rocm_check_status(rocprofiler_get_group(context, 0, group));
    // Fill profiling context entry
    entry->agent = agent;
    entry->group = *group;
    entry->data = *callback_data;
    // private heap copy of the name; the entry keeps it after this callback returns
    entry->data.kernel_name = strdup(callback_data->kernel_name);
    // publish last: the dump routine spins on this flag before reading the entry
    reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);
    return HSA_STATUS_SUCCESS;
}
// Builds the rocprofiler feature array for one device from the configured ROCm
// events.
//
// An event without a ":device=" qualifier applies to every device; a qualified
// event is used only when it contains ":device=<_device>", in which case the
// text before the qualifier becomes the feature name.
//
//  _device : device index the features are built for
//  ret     : receives a new[]-allocated array of metric features whose names
//            are strdup'd; nothing in this function releases them
//  returns : number of features in *ret
unsigned
metrics_input(unsigned _device, rocprofiler_feature_t** ret)
{
    const auto _requested   = tim::delimit(config::get_rocm_events(), ", ;\t\n");
    const auto _this_device = JOIN("", ":device=", _device);

    // select the feature names relevant to this device
    auto _selected = std::vector<std::string>{};
    for(const auto& itr : _requested)
    {
        ROCPROFSYS_VERBOSE_F(3, "Processing feature '%s' for device %u...\n", itr.c_str(),
                             _device);
        const auto _qualifier_pos = itr.find(":device=");
        if(_qualifier_pos == std::string::npos)
        {
            _selected.emplace_back(itr);
            continue;
        }
        if(itr.find(_this_device) != std::string::npos)
            _selected.emplace_back(itr.substr(0, _qualifier_pos));
    }

    const unsigned feature_count = _selected.size();
    auto*          _feature_arr  = new rocprofiler_feature_t[feature_count];
    memset(_feature_arr, 0, feature_count * sizeof(rocprofiler_feature_t));

    // every selected name becomes a parameterless metric feature
    for(unsigned i = 0; i < feature_count; ++i)
    {
        const auto& _feature_name = _selected.at(i);
        ROCPROFSYS_VERBOSE_F(3, "Adding feature '%s' for device %u...\n",
                             _feature_name.c_str(), _device);
        auto& _feature           = _feature_arr[i];
        _feature.kind            = ROCPROFILER_FEATURE_KIND_METRIC;
        _feature.name            = strdup(_feature_name.c_str());
        _feature.parameters      = nullptr;
        _feature.parameter_count = 0;
    }

    *ret = _feature_arr;
    return feature_count;
}
using info_data = std::vector<component::rocm_info_entry>;

// Callback for rocprofiler_iterate_info: converts one metric description into
// rocm_info_entry records appended to the info_data vector passed through `arg`.
//
//  - derived metric (has an expression): one entry
//  - basic metric with a single instance: one entry
//  - basic metric with several instances: one entry per instance, each carrying
//    an additional "[i]" instance qualifier
//
// Any info kind other than ROCPROFILER_INFO_KIND_METRIC is reported on stdout
// and yields HSA_STATUS_ERROR.
hsa_status_t
info_data_callback(const rocprofiler_info_data_t info, void* arg)
{
    using qualifier_t     = tim::hardware_counters::qualifier;
    using qualifier_vec_t = std::vector<qualifier_t>;

    if(info.kind != ROCPROFILER_INFO_KIND_METRIC)
    {
        printf("wrong info kind %u\n", info.kind);
        return HSA_STATUS_ERROR;
    }

    auto* _data      = static_cast<info_data*>(arg);
    auto  _dev_index = info.agent_index;

    auto _device_qualifier_sym = JOIN("", ":device=", _dev_index);
    auto _device_qualifier =
        qualifier_t{ true, static_cast<int>(_dev_index), _device_qualifier_sym,
                     JOIN(" ", "Device", _dev_index) };
    auto _long_desc = std::string{ info.metric.description };
    auto _units     = std::string{};
    auto _pysym     = std::string{};

    // derived metric
    if(info.metric.expr != nullptr)
    {
        auto _sym        = JOIN("", info.metric.name, _device_qualifier_sym);
        auto _short_desc = JOIN("", "Derived counter: ", info.metric.expr);
        _data->emplace_back(component::rocm_info_entry(
            true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym,
            _short_desc, _long_desc, _units, qualifier_vec_t{ _device_qualifier }));
        return HSA_STATUS_SUCCESS;
    }

    // basic metric, single instance
    if(info.metric.instances == 1)
    {
        auto _sym        = JOIN("", info.metric.name, _device_qualifier_sym);
        auto _short_desc = JOIN("", info.metric.name, " on device ", _dev_index);
        _data->emplace_back(component::rocm_info_entry(
            true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym,
            _short_desc, _long_desc, _units, qualifier_vec_t{ _device_qualifier }));
        return HSA_STATUS_SUCCESS;
    }

    // basic metric, one entry per instance
    for(uint32_t i = 0; i < info.metric.instances; ++i)
    {
        auto _instance_qualifier_sym = JOIN("", '[', i, ']');
        auto _instance_qualifier =
            qualifier_t{ true, static_cast<int>(i), _instance_qualifier_sym,
                         JOIN(" ", "Instance", i) };
        auto _sym =
            JOIN("", info.metric.name, _instance_qualifier_sym, _device_qualifier_sym);
        auto _short_desc =
            JOIN("", info.metric.name, " instance ", i, " on device ", _dev_index);
        _data->emplace_back(component::rocm_info_entry(
            true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym,
            _short_desc, _long_desc, _units,
            qualifier_vec_t{ _device_qualifier, _instance_qualifier }));
    }
    return HSA_STATUS_SUCCESS;
}
// Collects the metric descriptions available on all GPU agents.
//
// Returns an empty vector when the HSA resource factory cannot be created.
// Otherwise every GPU agent is queried; when that yields nothing although GPUs
// exist, one more query is made without a specific agent. If the
// ROCPROFSYS_ROCM_EVENTS setting exists and has no choices yet, its choices are
// set to the non-empty symbols that were found.
std::vector<component::rocm_info_entry>
rocm_metrics()
{
    std::vector<component::rocm_info_entry> _data = {};
    try
    {
        (void) HsaRsrcFactory::Instance();
    } catch(std::runtime_error& _e)
    {
        ROCPROFSYS_VERBOSE_F(0, "%s\n", _e.what());
        return _data;
    }

    // Available GPU agents
    const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
    for(unsigned i = 0; i < gpu_count; ++i)
    {
        // the lookup reports failure through its return value; never
        // dereference the agent info unless it succeeded
        const AgentInfo* _agent = nullptr;
        if(!HsaRsrcFactory::Instance().GetGpuAgentInfo(i, &_agent) || _agent == nullptr)
        {
            ROCPROFSYS_WARNING_F(-1, "GetGpuAgentInfo failed for gpu agent %u\n", i);
            continue;
        }

        if(!rocm_check_status(rocprofiler_iterate_info(
                                  &_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC,
                                  info_data_callback, reinterpret_cast<void*>(&_data)),
                              { HSA_STATUS_ERROR_NOT_INITIALIZED }))
        {
            ROCPROFSYS_WARNING_F(-1, "rocprofiler_iterate_info failed for gpu agent %u\n",
                                 i);
        }
    }

    // fall back to an agent-less query when the per-agent queries found nothing
    if(gpu_count > 0 && _data.empty())
    {
        if(!rocm_check_status(rocprofiler_iterate_info(
                                  nullptr, ROCPROFILER_INFO_KIND_METRIC,
                                  info_data_callback, reinterpret_cast<void*>(&_data)),
                              { HSA_STATUS_ERROR_NOT_INITIALIZED }))
        {
            ROCPROFSYS_WARNING_F(
                -1, "rocprofiler_iterate_info failed for %i gpu agents\n", gpu_count);
        }
    }

    auto _settings = tim::settings::shared_instance();
    if(_settings)
    {
        auto ritr = _settings->find("ROCPROFSYS_ROCM_EVENTS");
        if(ritr != _settings->end())
        {
            auto _rocm_events = ritr->second;
            if(_rocm_events->get_choices().empty())
            {
                std::vector<std::string> _choices = {};
                _choices.reserve(_data.size());
                for(auto& itr : _data)
                {
                    if(!itr.symbol().empty()) _choices.emplace_back(itr.symbol());
                }
                _rocm_events->set_choices(_choices);
            }
        }
    }
    return _data;
}
void
rocm_initialize()
{
// Available GPU agents
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
(void) rocm_metrics();
// Adding dispatch observer
callbacks_arg_t* callbacks_arg = new callbacks_arg_t{};
callbacks_arg->pools = new rocprofiler_pool_t*[gpu_count];
for(unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++)
{
// Getting profiling features
rocprofiler_feature_t* features = nullptr;
unsigned feature_count = metrics_input(gpu_id, &features);
if(features)
{
get_event_names()[gpu_id].clear();
get_event_names()[gpu_id].reserve(feature_count);
for(unsigned i = 0; i < feature_count; ++i)
get_event_names().at(gpu_id).emplace_back(features[i]);
}
// Handler arg
handler_arg_t* handler_arg = new handler_arg_t{};
handler_arg->features = features;
handler_arg->feature_count = feature_count;
// Context properties
rocprofiler_pool_properties_t properties{};
properties.num_entries = 100;
properties.payload_bytes = sizeof(context_entry_t);
properties.handler = rocm_context_handler;
properties.handler_arg = handler_arg;
// Getting GPU device info
const AgentInfo* agent_info = nullptr;
if(HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false)
{
fprintf(stderr, "GetGpuAgentInfo failed\n");
abort();
}
// Open profiling pool
rocprofiler_pool_t* pool = nullptr;
uint32_t mode = 0; // ROCPROFILER_MODE_SINGLEGROUP
rocm_check_status(rocprofiler_pool_open(agent_info->dev_id, features,
feature_count, &pool, mode, &properties));
callbacks_arg->pools[gpu_id] = pool;
}
rocprofiler_queue_callbacks_t callbacks_ptrs{};
callbacks_ptrs.dispatch = rocm_dispatch_callback;
int err = rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg);
ROCPROFSYS_VERBOSE_F(3, "err=%d, rocprofiler_set_queue_callbacks\n", err);
is_setup() = true;
}
/// Unregisters the rocprofiler queue (dispatch) callbacks.
void
rocm_cleanup()
{
    // The profiling pools are not flushed or closed here; the corresponding calls
    // remain disabled:
    //   rocm_check_status(rocprofiler_pool_flush(pool));
    //   rocm_check_status(rocprofiler_pool_close(pool));
    rocm_check_status(rocprofiler_remove_queue_callbacks());
}
namespace
{
using rocm_event = component::rocm_event;
using rocm_data_t = component::rocm_data_t;
using rocm_metric_type = component::rocm_metric_type;
using rocm_feature_value = component::rocm_feature_value;
using rocm_data_tracker = component::rocm_data_tracker;
void
post_process_perfetto()
{
using counter_track = perfetto_counter_track<rocm_event>;
static bool _once = false;
if(_once) return;
auto _data = rocm_data_t{};
auto _device_data = std::map<uint32_t, std::vector<rocm_event*>>{};
auto _device_fields = std::map<uint32_t, std::vector<std::string_view>>{};
auto _device_range = std::map<uint32_t, std::set<rocm_metric_type>>{};
for(size_t i = 0; i < ROCPROFSYS_MAX_THREADS; ++i)
{
auto& _v = component::rocm_data(i);
if(_v)
{
_data.reserve(_data.size() + _v->size());
for(auto& itr : *_v)
_data.emplace_back(itr);
}
}
if(_data.empty()) return;
_once = true;
std::sort(_data.begin(), _data.end());
auto _get_events = [](std::vector<rocm_event*>& _inp, rocm_metric_type _ts) {
auto _v = std::vector<rocm_event*>{};
for(const auto& itr : _inp)
{
if(_ts >= itr->entry && _ts <= itr->exit) _v.emplace_back(itr);
if(_ts > itr->exit) break;
}
return _v;
};
{
auto _device_time = std::map<uint32_t, std::set<rocm_metric_type>>{};
for(auto& itr : _data)
{
_device_data[itr.device_id].emplace_back(&itr);
_device_time[itr.device_id].emplace(itr.entry);
_device_time[itr.device_id].emplace(itr.exit);
auto _dev_id = itr.device_id;
if(get_use_perfetto() && !counter_track::exists(_dev_id))
{
auto addendum = [&](auto&& _v) {
return JOIN(" ", "Device", _v, JOIN("", '[', _dev_id, ']'));
};
for(auto nitr : itr.feature_names)
{
auto _name = get_data_labels().at(itr.device_id).at(nitr);
counter_track::emplace(_dev_id, addendum(_name));
}
}
}
for(auto& ditr : _device_time)
{
for(auto itr = ditr.second.begin(); itr != ditr.second.end(); ++itr)
{
auto _next = std::next(itr);
if(_next == ditr.second.end()) continue;
_device_range[ditr.first].emplace(((*_next / 2) + (*itr / 2)));
}
}
}
for(auto& ditr : _device_range)
{
auto _dev_id = ditr.first;
auto _values = std::vector<rocm_feature_value>{};
auto _ts_sorted_data = _device_data[_dev_id];
std::sort(_ts_sorted_data.begin(), _ts_sorted_data.end(),
[](auto* _l, auto* _r) { return _l->exit < _r->exit; });
for(const auto& itr : ditr.second)
{
auto _v = _get_events(_ts_sorted_data, itr);
uint64_t _ts = itr;
for(auto* vitr : _v)
{
size_t _n = vitr->feature_values.size();
if(_values.empty())
{
_values.reserve(_n);
for(size_t i = 0; i < _n; ++i)
{
_values.emplace_back(vitr->feature_values.at(i));
}
}
else
{
for(size_t i = 0; i < _n; ++i)
{
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif
auto _plus = [](auto& _lhs, auto&& _rhs) { _lhs += _rhs; };
std::visit(_plus, _values.at(i), vitr->feature_values.at(i));
#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
}
}
}
for(size_t i = 0; i < _values.size(); ++i)
{
auto _trace_counter = [_dev_id, i, _ts](auto&& _val) {
TRACE_COUNTER("kernel_hardware_counter",
counter_track::at(_dev_id, i), _ts, _val);
};
std::visit(_trace_counter, _values.at(i));
}
}
}
}
/// Writes the collected rocprofiler counter values through timemory storage.
///
/// All per-thread rocm_event records are gathered, grouped per device and ordered
/// by exit time. For every device one standalone storage is created per feature
/// index; events are arranged into a parent/child hierarchy based on interval
/// containment and replayed into each storage, which is then written out. Finally
/// the rocm_data_tracker component is disabled at runtime.
///
/// Runs at most once after data has been found (guarded by a function-local static).
void
post_process_timemory()
{
    static bool _once = false;
    if(_once) return;

    auto _data        = rocm_data_t{};
    auto _device_data = std::map<uint32_t, std::vector<rocm_event*>>{};

    // merge the per-thread records into one container
    for(size_t i = 0; i < ROCPROFSYS_MAX_THREADS; ++i)
    {
        auto& _v = component::rocm_data(i);
        if(_v)
        {
            _data.reserve(_data.size() + _v->size());
            for(auto& itr : *_v)
                _data.emplace_back(itr);
        }
    }

    if(_data.empty()) return;
    _once = true;

    std::sort(_data.begin(), _data.end());

    for(auto& itr : _data)
    {
        _device_data[itr.device_id].emplace_back(&itr);
    }

    for(auto& itr : _device_data)
    {
        // sort according to when it exited
        std::sort(itr.second.begin(), itr.second.end(),
                  [](auto* _lhs, auto* _rhs) { return _lhs->exit < _rhs->exit; });
    }

    using storage_type = typename rocm_data_tracker::storage_type;
    using bundle_type  = tim::lightweight_tuple<rocm_data_tracker>;

    auto _info = rocm_metrics();

    // Looks up the long description of the first metric whose symbol or short
    // description starts with _v. It is static so the local classes below can
    // use it; it references _info, which is only safe because the _once guard
    // prevents this point from being reached a second time.
    static auto _get_description = [&_info](std::string_view _v) {
        for(auto& itr : _info)
        {
            if(itr.symbol().find(_v) == 0 || itr.short_description().find(_v) == 0)
            {
                return itr.long_description();
            }
        }
        return std::string{};
    };

    // An event plus the events nested inside its [entry, exit] interval.
    struct local_event
    {
        rocm_event*                      parent   = nullptr;
        mutable std::vector<local_event> children = {};

        ROCPROFSYS_DEFAULT_OBJECT(local_event)

        explicit local_event(rocm_event* _v)
        : parent{ _v }
        {}

        // Adopts _v as a child if it is on the same device and lies within the
        // parent's interval; returns whether it was adopted.
        bool operator()(rocm_event* _v)
        {
            if(!parent) return false;
            if(_v->device_id != parent->device_id) return false;
            if(_v->entry > parent->entry && _v->exit <= parent->exit)
            {
                children.emplace_back(_v);
                return true;
            }
            return false;
        }

        // Orders by the underlying events; an instance without a parent sorts
        // before one with a parent. Two parent-less instances compare equal
        // (previously this case dereferenced both null pointers).
        bool operator<(const local_event& _v) const
        {
            if(!parent || !_v.parent) return (!parent && _v.parent);
            return *parent < *_v.parent;
        }

        // Records the feature value at _index for this event, then recursively
        // for its children, inside a start/stop of the tracker bundle.
        void operator()(int64_t _index, scope::config _scope) const
        {
            if(!parent) return;

            bundle_type _bundle{ parent->name, _scope };

            _bundle.push(parent->queue_id)
                .start()
                .store(parent->feature_values.at(_index));

            std::sort(children.begin(), children.end());
            for(const auto& itr : children)
                itr(_index, _scope);

            _bundle.stop().pop(parent->queue_id);
        }
    };

    // Standalone storage for one feature index of one device.
    struct local_storage
    {
        int64_t                       index              = 0;
        std::string                   metric_name        = {};
        std::string                   metric_description = {};
        std::unique_ptr<storage_type> storage            = {};

        local_storage(uint32_t _devid, size_t _idx, std::string_view _name)
        : index{ static_cast<int64_t>(_idx) }
        , metric_name{ _name }
        , metric_description{ _get_description(metric_name) }
        {
            // "NAME[3]" becomes "NAME_3" in the storage label
            auto _metric_name = std::string{ _name };
            _metric_name      = std::regex_replace(
                _metric_name, std::regex{ "(.*)\\[([0-9]+)\\]" }, "$1_$2");
            storage = std::make_unique<storage_type>(
                tim::standalone_storage{}, index,
                JOIN('-', "rocprof", "device", _devid, _metric_name));
        }

        // Replays _event (and its children) into this storage.
        void operator()(const local_event& _event, scope::config _scope) const
        {
            operation::set_storage<rocm_data_tracker>{}(storage.get());
            _event(index, _scope);
        }

        // Sets the tracker label/description for this metric and writes the storage.
        void write() const
        {
            rocm_data_tracker::label()       = metric_name;
            rocm_data_tracker::description() = metric_description;
            storage->write();
        }
    };

    auto _local_data = std::map<uint32_t, std::vector<local_event>>{};

    auto _scope = scope::get_default();

    for(auto& ditr : _device_data)
    {
        ROCPROFSYS_VERBOSE_F(1, "Post-processing %zu entries for device %u...\n",
                             ditr.second.size(), ditr.first);

        // one storage per feature index; grown to the largest feature count seen
        auto _storage = std::vector<local_storage>{};
        for(auto& itr : ditr.second)
        {
            auto _n = itr->feature_names.size();
            if(_n > _storage.size())
            {
                _storage.reserve(_n);
                for(size_t i = _storage.size(); i < _n; ++i)
                    _storage.emplace_back(
                        ditr.first, i,
                        get_data_labels().at(ditr.first).at(itr->feature_names.at(i)));
            }
        }

        auto& _local = _local_data[ditr.first];
        _local.reserve(ditr.second.size());

        double _avg = 0.0;
        for(auto& itr : ditr.second)
        {
            if(_local.empty() || itr->entry >= _local.back().parent->exit)
            {
                // starts after the most recent top-level event ended
                _local.emplace_back(itr);
            }
            else
            {
                // search backwards for a top-level event that contains this one
                size_t _n     = 0;
                bool   _found = false;
                for(auto litr = _local.rbegin(); litr != _local.rend(); ++litr)
                {
                    ++_n;
                    if((*litr)(itr))
                    {
                        _found = true;
                        break;
                    }
                }
                if(!_found) _local.emplace_back(itr);
                _avg += _n;
            }
        }

        ROCPROFSYS_VERBOSE_F(3, "Average # of iterations before match: %.1f\n",
                             _avg / ditr.second.size() * 100.0);

        for(auto& sitr : _storage)
        {
            for(auto& itr : _local)
                sitr(itr, _scope);
        }

        for(auto& itr : _storage)
            itr.write();
    }

    tim::trait::runtime_enabled<rocprofsys::rocprofiler::rocm_data_tracker>::set(false);
}
} // namespace
/// Post-processes the collected counter data for the enabled backends: perfetto
/// output is produced immediately; timemory output is registered as a cleanup
/// routine on the master manager when one exists, otherwise it is run right away.
void
post_process()
{
    if(get_use_perfetto()) post_process_perfetto();

    if(!get_use_timemory()) return;

    if(auto _manager = tim::manager::master_instance(); _manager)
        _manager->add_cleanup("rocprofiler", &post_process_timemory);
    else
        post_process_timemory();
}
} // namespace rocprofiler
} // namespace rocprofsys
-967
ファイルの表示
@@ -1,967 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/roctracer.hpp"
#include "binary/analysis.hpp"
#include "core/components/fwd.hpp"
#include "core/concepts.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/locking.hpp"
#include "library/components/category_region.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include "library/thread_info.hpp"
#include "library/tracing.hpp"
#include <timemory/backends/cpu.hpp>
#include <timemory/backends/threading.hpp>
#include <timemory/hash/types.hpp>
#include <timemory/utility/types.hpp>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <tuple>
#include <roctracer_ext.h>
#include <roctracer_hip.h>
#include <roctracer_roctx.h>
#if ROCPROFSYS_HIP_VERSION < 50300
# include <roctracer_hcc.h>
#endif
#define AMD_INTERNAL_BUILD 1
#include <roctracer_hsa.h>
#if __has_include(<hip/amd_detail/hip_prof_str.h>) || (defined(ROCPROFSYS_USE_HIP) && ROCPROFSYS_USE_HIP > 0)
# include <hip/amd_detail/hip_prof_str.h>
# define ROCPROFSYS_HIP_API_ARGS 1
#else
# define ROCPROFSYS_HIP_API_ARGS 0
#endif
TIMEMORY_DEFINE_API(roctracer)
namespace rocprofsys
{
namespace
{
/// Mutex associated with type Tp in the roctracer category.
template <typename Tp>
auto&
roctracer_type_mutex()
{
    return tim::type_mutex<Tp, category::roctracer, max_supported_threads,
                           locking::atomic_mutex>();
}

/// Returns the argument list of a HIP API call as text.
///
/// When HIP API argument support is compiled in, the string produced by
/// hipApiString() is reduced to the text between the first '(' and the last ')';
/// if no such pair exists the string is returned unmodified. Without that
/// support an empty string is returned.
std::string
hip_api_string(hip_api_id_t id, const hip_api_data_t* data)
{
#if ROCPROFSYS_HIP_API_ARGS > 0
    std::string _v = hipApiString(id, data);
    if(_v.empty()) return _v;
    auto _pbeg = _v.find('(');
    if(_pbeg == std::string::npos) return _v;
    auto _pend = _v.find_last_of(')');
    if(_pend == std::string::npos || _pbeg >= _pend) return _v;
    auto _n = (_pend - _pbeg - 1);
    return _v.substr(_pbeg + 1, _n);
#else
    tim::consume_parameters(id, data);
    // a value must be returned on every path of a non-void function
    return std::string{};
#endif
}

/// Per-thread device number, initialized to 1.
int&
get_current_device()
{
    static thread_local int _v = 1;
    return _v;
}

/// Process-wide set of 64-bit kernel identifiers.
std::unordered_set<uint64_t>&
get_roctracer_kernels()
{
    static auto _v = std::unordered_set<uint64_t>{};
    return _v;
}

/// Map from a 64-bit id to its HIP timemory bundle for the given thread
/// (defaults to the calling thread).
auto&
get_roctracer_hip_data(int64_t _tid = threading::get_id())
{
    using data_t        = std::unordered_map<uint64_t, roctracer_hip_bundle_t>;
    using thread_data_t = thread_data<data_t, category::roctracer>;
    return thread_data_t::instance(construct_on_thread{ _tid });
}

/// Process-wide map from a 64-bit id to a name string.
std::unordered_map<uint64_t, const char*>&
get_roctracer_key_data()
{
    static auto _v = std::unordered_map<uint64_t, const char*>{};
    return _v;
}

/// Process-wide map from a 64-bit id to a thread id.
std::unordered_map<uint64_t, int64_t>&
get_roctracer_tid_data()
{
    static auto _v = std::unordered_map<uint64_t, int64_t>{};
    return _v;
}

/// Deferred callbacks for the given thread (defaults to the calling thread).
auto&
get_hip_activity_callbacks(int64_t _tid = threading::get_id())
{
    using thread_data_t =
        thread_data<std::vector<std::function<void()>>, category::roctracer>;
    return thread_data_t::instance(construct_on_thread{ _tid });
}

/// Size reported by the thread_data container that holds the deferred callbacks.
size_t
get_hip_activity_callbacks_size()
{
    using thread_data_t =
        thread_data<std::vector<std::function<void()>>, category::roctracer>;
    return thread_data_t::size();
}

using hip_activity_mutex_t = std::decay_t<decltype(get_hip_activity_callbacks())>;
using key_data_mutex_t     = std::decay_t<decltype(get_roctracer_key_data())>;

/// Mutex guarding the deferred callbacks of the given thread; the thread id is
/// wrapped into the range [0, max_supported_threads).
auto&
get_hip_activity_mutex(int64_t _tid = threading::get_id())
{
    return tim::type_mutex<hip_activity_mutex_t, category::roctracer,
                           max_supported_threads, locking::atomic_mutex>(
        _tid % max_supported_threads);
}
}  // namespace
//
/// Returns the CPU/GPU timestamp skew, measured once and cached.
///
/// The environment variable ROCPROFSYS_USE_ROCTRACER_CLOCK_SKEW (default: true)
/// controls whether a skew is applied; when it is false, 0 is returned and no
/// measurement takes place.
int64_t
get_clock_skew()
{
    static auto _use = tim::get_env("ROCPROFSYS_USE_ROCTRACER_CLOCK_SKEW", true);
    if(!_use) return 0;

    // measured on first use only
    static auto _skew = []() {
        // GPU-side clock as reported by roctracer
        auto _gpu_timestamp = []() {
            uint64_t _now = 0;
            roctracer_get_timestamp(&_now);
            return _now;
        };

        // the first measurement is a warm-up and its result is thrown away
        (void) tracing::get_clock_skew(_gpu_timestamp, 1);

        auto _measured = tracing::get_clock_skew(_gpu_timestamp, 10);
        ROCPROFSYS_BASIC_VERBOSE(1, "CPU/HIP timestamp skew: %li (used: %s)\n", _measured,
                                 _use ? "yes" : "no");
        return _measured;
    }();

    return _skew;
}
// HSA API callback function
void
hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg)
{
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
return;
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
(void) arg;
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
ROCPROFSYS_CONDITIONAL_PRINT_F(
get_debug() && get_verbose() >= 2, "<%-30s id(%u)\tcorrelation_id(%lu) %s>\n",
roctracer_op_string(domain, cid, 0), cid, data->correlation_id,
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
static thread_local int64_t begin_timestamp = 0;
switch(cid)
{
case HSA_API_ID_hsa_init:
case HSA_API_ID_hsa_shut_down:
case HSA_API_ID_hsa_agent_get_exception_policies:
case HSA_API_ID_hsa_agent_get_info:
case HSA_API_ID_hsa_amd_agent_iterate_memory_pools:
case HSA_API_ID_hsa_amd_agent_memory_pool_get_info:
case HSA_API_ID_hsa_amd_coherency_get_type:
case HSA_API_ID_hsa_amd_memory_pool_get_info:
case HSA_API_ID_hsa_amd_pointer_info:
case HSA_API_ID_hsa_amd_pointer_info_set_userdata:
case HSA_API_ID_hsa_amd_profiling_async_copy_enable:
case HSA_API_ID_hsa_amd_profiling_get_async_copy_time:
case HSA_API_ID_hsa_amd_profiling_get_dispatch_time:
case HSA_API_ID_hsa_amd_profiling_set_profiler_enabled:
case HSA_API_ID_hsa_cache_get_info:
case HSA_API_ID_hsa_code_object_get_info:
case HSA_API_ID_hsa_code_object_get_symbol:
case HSA_API_ID_hsa_code_object_get_symbol_from_name:
case HSA_API_ID_hsa_code_object_reader_create_from_memory:
case HSA_API_ID_hsa_code_symbol_get_info:
case HSA_API_ID_hsa_executable_create_alt:
case HSA_API_ID_hsa_executable_freeze:
case HSA_API_ID_hsa_executable_get_info:
case HSA_API_ID_hsa_executable_get_symbol:
case HSA_API_ID_hsa_executable_get_symbol_by_name:
case HSA_API_ID_hsa_executable_symbol_get_info:
case HSA_API_ID_hsa_extension_get_name:
case HSA_API_ID_hsa_ext_image_data_get_info:
case HSA_API_ID_hsa_ext_image_data_get_info_with_layout:
case HSA_API_ID_hsa_ext_image_get_capability:
case HSA_API_ID_hsa_ext_image_get_capability_with_layout:
case HSA_API_ID_hsa_isa_get_exception_policies:
case HSA_API_ID_hsa_isa_get_info:
case HSA_API_ID_hsa_isa_get_info_alt:
case HSA_API_ID_hsa_isa_get_round_method:
case HSA_API_ID_hsa_region_get_info:
case HSA_API_ID_hsa_system_extension_supported:
case HSA_API_ID_hsa_system_get_extension_table:
case HSA_API_ID_hsa_system_get_info:
case HSA_API_ID_hsa_system_get_major_extension_table:
case HSA_API_ID_hsa_wavefront_get_info: break;
default:
{
if(data->phase == ACTIVITY_API_PHASE_ENTER)
{
begin_timestamp = comp::wall_clock::record();
}
else
{
const auto* _name = roctracer_op_string(domain, cid, 0);
const auto end_timestamp = (cid == HSA_API_ID_hsa_shut_down)
? begin_timestamp
: comp::wall_clock::record();
if(begin_timestamp > end_timestamp) return;
if(get_use_perfetto())
{
uint64_t _beg_ts = begin_timestamp;
uint64_t _end_ts = end_timestamp;
tracing::push_perfetto_ts(category::rocm_hsa{}, _name, _beg_ts,
[&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
tracing::add_perfetto_annotation(
ctx, "begin_ns", _beg_ts);
}
});
tracing::pop_perfetto_ts(category::rocm_hsa{}, _name, _end_ts,
[&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
tracing::add_perfetto_annotation(
ctx, "end_ns", _end_ts);
}
});
}
if(get_use_timemory())
{
auto _beg_ns = begin_timestamp;
auto _end_ns = end_timestamp;
if(tasking::roctracer::get_task_group().pool())
tasking::roctracer::get_task_group().exec(
[_name, _beg_ns, _end_ns]() {
roctracer_hsa_bundle_t _bundle{ _name };
_bundle.start()
.store(std::plus<double>{},
static_cast<double>(_end_ns - _beg_ns))
.stop();
});
}
// timemory is disabled in this callback because collecting data in this
// thread causes strange segmentation faults
}
}
}
}
/// Handles one HSA activity record (dispatch, copy or barrier).
///
/// The record's begin/end times are shifted by get_clock_skew(), emitted as a
/// perfetto slice in the device_hsa category when perfetto is enabled, and handed
/// to the roctracer task group where the duration is stored in a timemory bundle
/// when timemory is enabled.
///
/// \param op       HSA operation id selecting the slice name
/// \param vrecord  pointer to the activity_record_t
/// \param arg      unused
void
hsa_activity_callback(uint32_t op, const void* vrecord, void* arg)
{
    const auto* record = static_cast<const activity_record_t*>(vrecord);

    if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
        return;

    ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);

    auto&& _protect = comp::roctracer::protect_flush_activity();
    (void) _protect;

    static const char* copy_op_name     = "hsa_async_copy";
    static const char* dispatch_op_name = "hsa_dispatch";
    static const char* barrier_op_name  = "hsa_barrier";

    // pick the label for this kind of operation
    const char* _name = nullptr;
    if(op == HSA_OP_ID_DISPATCH)
        _name = dispatch_op_name;
    else if(op == HSA_OP_ID_COPY)
        _name = copy_op_name;
    else if(op == HSA_OP_ID_BARRIER)
        _name = barrier_op_name;

    ROCPROFSYS_CI_FAIL(_name == nullptr, "Error! HSA operation type not handled: %u\n",
                       op);

    // unknown operation: nothing to record
    if(!_name) return;

    auto _beg_ns = record->begin_ns + get_clock_skew();
    auto _end_ns = record->end_ns + get_clock_skew();

    if(get_use_perfetto())
    {
        uint64_t _beg = _beg_ns;
        uint64_t _end = _end_ns;

        auto _annotate_begin = [&](::perfetto::EventContext ctx) {
            if(config::get_perfetto_annotations())
            {
                tracing::add_perfetto_annotation(ctx, "begin_ns", _beg);
            }
        };
        auto _annotate_end = [&](::perfetto::EventContext ctx) {
            if(config::get_perfetto_annotations())
            {
                tracing::add_perfetto_annotation(ctx, "end_ns", _end);
            }
        };

        tracing::push_perfetto_ts(category::device_hsa{}, _name, _beg, _annotate_begin);
        tracing::pop_perfetto_ts(category::device_hsa{}, _name, _end, _annotate_end);
    }

    // the timemory bundle is populated on the task group rather than on this
    // thread: collecting data in this thread causes strange segmentation faults
    auto _store = [_beg_ns, _end_ns, _name]() {
        if(!get_use_timemory()) return;
        roctracer_hsa_bundle_t _bundle{ _name };
        _bundle.start()
            .store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
            .stop();
    };

    if(tasking::roctracer::get_task_group().pool())
        tasking::roctracer::get_task_group().exec(_store);

    tim::consume_parameters(arg);
}
/// Runs and then discards every deferred callback queued for thread _tid.
/// Does nothing when _tid is beyond the current size of the callback storage or
/// when no callback container exists for that thread.
void
hip_exec_activity_callbacks(int64_t _tid)
{
    // guard against initialization of structure when trying to exec
    const auto _index = static_cast<size_t>(_tid);
    if(_index >= get_hip_activity_callbacks_size()) return;

    // ROCPROFSYS_ROCTRACER_CALL(roctracer_flush_activity());

    // hold the per-thread mutex while the queue is drained
    locking::atomic_lock _guard{ get_hip_activity_mutex(_tid) };

    auto& _pending = get_hip_activity_callbacks(_tid);
    if(_pending)
    {
        for(auto& _callback : *_pending)
        {
            // skip empty std::function objects
            if(_callback) _callback();
        }
        _pending->clear();
    }
}
namespace
{
// Per-thread (thread_local) map from size_t keys to size_t values, initially empty.
// NOTE(review): the name suggests it relates GPU correlation ids to critical-trace
// ids, but nothing in the visible code reads or writes it — confirm it is used.
thread_local std::unordered_map<size_t, size_t> gpu_crit_cids = {};
}
/// roctx API callback: maps roctx ranges and markers onto rocm_roctx regions.
///
/// - roctxRangePushA / roctxRangePop: the message is kept on a per-thread stack so
///   the matching region can be stopped on pop; popping an empty stack throws.
/// - roctxRangeStartA / roctxRangeStop: the message is kept in a process-wide map
///   keyed by range id (guarded by a lock); the entry is removed again on stop.
/// - roctxMarkA: emits a mark.
///
/// Messages that are null are ignored.
///
/// \param domain         must be ACTIVITY_DOMAIN_ROCTX, otherwise the call is ignored
/// \param cid            roctx API id
/// \param callback_data  pointer to the roctx_api_data_t of the call
void
roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data,
                   void* /*arg*/)
{
    if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
        return;

    ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);

    if(domain != ACTIVITY_DOMAIN_ROCTX) return;

    static auto _range_map  = std::unordered_map<roctx_range_id_t, std::string>{};
    static auto _range_lock = locking::atomic_mutex{};
    const auto* _data = reinterpret_cast<const roctx_api_data_t*>(callback_data);
    static thread_local auto _range_stack = std::vector<std::string>{};

    switch(cid)
    {
        case ROCTX_API_ID_roctxRangePushA:
        {
            if(_data->args.message)
            {
                auto& itr = _range_stack.emplace_back(std::string{ _data->args.message });
                component::category_region<category::rocm_roctx>::start(itr.c_str());
            }
            break;
        }
        case ROCTX_API_ID_roctxRangePop:
        {
            if(!_range_stack.empty())
            {
                auto& itr = _range_stack.back();
                component::category_region<category::rocm_roctx>::stop(itr.c_str());
                _range_stack.pop_back();
            }
            else
            {
                ROCPROFSYS_THROW("Error! roctxRangePop stack is empty! Expected "
                                 "roctxRangePush/roctxRangePop on same thread\n");
            }
            break;
        }
        case ROCTX_API_ID_roctxRangeStartA:
        {
            // constructing a std::string from a null pointer is undefined
            if(!_data->args.message) break;
            {
                locking::atomic_lock _lk{ _range_lock, std::defer_lock };
                if(!_lk.owns_lock()) _lk.lock();
                _range_map.emplace(roctx_range_id_t{ _data->args.id },
                                   std::string{ _data->args.message });
            }
            component::category_region<category::rocm_roctx>::start(_data->args.message);
            break;
        }
        case ROCTX_API_ID_roctxRangeStop:
        {
            // owned copy: the map entry is erased before the lock is released
            auto _message = std::string{};
            {
                locking::atomic_lock _lk{ _range_lock, std::defer_lock };
                if(!_lk.owns_lock()) _lk.lock();
                auto itr = _range_map.find(roctx_range_id_t{ _data->args.id });
                ROCPROFSYS_CI_THROW(itr == _range_map.end(),
                                    "Error! could not find range with id %lu\n",
                                    _data->args.id);
                if(itr == _range_map.end())
                {
                    ROCPROFSYS_VERBOSE(0, "Warning! could not find range with id %lu\n",
                                       _data->args.id);
                    return;
                }
                _message = std::move(itr->second);
                // the range is finished: drop it so the map does not grow forever
                _range_map.erase(itr);
            }
            if(!_message.empty())
            {
                component::category_region<category::rocm_roctx>::stop(_message.c_str());
            }
            break;
        }
        case ROCTX_API_ID_roctxMarkA:
        {
            if(_data->args.message)
            {
                component::category_region<category::rocm_roctx>::mark(
                    _data->args.message);
            }
            break;
        }
        default: break;
    }
}
/// HIP API callback function.
///
/// Invoked on entry to and exit from HIP API calls. A few calls (push/pop call
/// configuration, peer-access and external-memory functions) are ignored.
///
/// On entry:
///  - hipSetDevice updates the per-thread current device (stored as id + 1);
///  - for kernel launches the kernel name is resolved and, together with the
///    calling thread id, stored under the call's correlation id;
///  - a perfetto slice is opened (with optional annotations for the call
///    arguments and a backtrace) and/or a timemory bundle is started;
///  - deferred activity callbacks of this thread are executed.
/// On exit the deferred callbacks are executed, the perfetto slice is closed and
/// the timemory bundle for the correlation id is stopped.
///
/// \param domain         expected to be ACTIVITY_DOMAIN_HIP_API
/// \param cid            HIP API id
/// \param callback_data  pointer to the hip_api_data_t of the call
/// \param arg            unused
void
hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg)
{
    if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
        return;

    ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);

    assert(domain == ACTIVITY_DOMAIN_HIP_API);
    const char* op_name = roctracer_op_string(domain, cid, 0);
    if(op_name == nullptr) op_name = hip_api_name(cid);
    if(op_name == nullptr) return;
    assert(std::string{ op_name } == std::string{ hip_api_name(cid) });

    switch(cid)
    {
        case HIP_API_ID___hipPushCallConfiguration:
        case HIP_API_ID___hipPopCallConfiguration:
        case HIP_API_ID_hipDeviceEnablePeerAccess:
#if ROCPROFSYS_HIP_VERSION_MAJOR > 4 ||                                                  \
    (ROCPROFSYS_HIP_VERSION_MAJOR == 4 && ROCPROFSYS_HIP_VERSION_MINOR >= 3)
        case HIP_API_ID_hipImportExternalMemory:
        case HIP_API_ID_hipDestroyExternalMemory:
#endif
            return;
        default: break;
    }

    const hip_api_data_t* data = reinterpret_cast<const hip_api_data_t*>(callback_data);
    ROCPROFSYS_CONDITIONAL_PRINT_F(
        get_debug() && get_verbose() >= 2, "<%-30s id(%u)\tcorrelation_id(%lu) %s>\n",
        op_name, cid, data->correlation_id,
        (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");

    int64_t  _ts              = comp::wall_clock::record();
    auto     _tid             = threading::get_id();
    uint64_t _crit_cid        = 0;
    uint64_t _parent_crit_cid = 0;
    uint32_t _depth           = 0;
    auto     _roct_cid        = data->correlation_id;
    auto&    _device_id       = get_current_device();

    if(data->phase == ACTIVITY_API_PHASE_ENTER)
    {
        if(cid == HIP_API_ID_hipSetDevice)
            get_current_device() =
                static_cast<int>(data->args.hipSetDevice.deviceId) + 1;

        // resolve the kernel name for the various launch entry points
        const char* _name = nullptr;
        switch(cid)
        {
            case HIP_API_ID_hipLaunchKernel:
            {
                _name = hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
                                              data->args.hipLaunchKernel.stream);
                break;
            }
            case HIP_API_ID_hipLaunchCooperativeKernel:
            {
                _name =
                    hipKernelNameRefByPtr(data->args.hipLaunchCooperativeKernel.f,
                                          data->args.hipLaunchCooperativeKernel.stream);
                if(!_name)
                {
                    _name =
                        hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
                                              data->args.hipLaunchKernel.stream);
                }
                break;
            }
            case HIP_API_ID_hipHccModuleLaunchKernel:
            {
                _name = hipKernelNameRef(data->args.hipHccModuleLaunchKernel.f);
                break;
            }
            case HIP_API_ID_hipModuleLaunchKernel:
            {
                _name = hipKernelNameRef(data->args.hipModuleLaunchKernel.f);
                break;
            }
            case HIP_API_ID_hipExtModuleLaunchKernel:
            {
                _name = hipKernelNameRef(data->args.hipExtModuleLaunchKernel.f);
                break;
            }
            case HIP_API_ID_hipExtLaunchKernel:
            {
                // read the stream from the argument struct of this API, not from
                // the hipLaunchKernel member of the union
                _name =
                    hipKernelNameRefByPtr(data->args.hipExtLaunchKernel.function_address,
                                          data->args.hipExtLaunchKernel.stream);
                break;
            }
            default: break;
        }

        if(_name != nullptr)
        {
            if(get_use_perfetto() || get_use_timemory() || get_use_rocm_smi())
            {
                locking::atomic_lock _lk{ roctracer_type_mutex<key_data_mutex_t>() };
                get_roctracer_key_data().emplace(_roct_cid, _name);
                get_roctracer_tid_data().emplace(_roct_cid, _tid);
            }
        }

        std::tie(_crit_cid, _parent_crit_cid, _depth) = create_cpu_cid_entry();

        if(get_use_perfetto())
        {
            static auto _compact_annotations =
                config::get_setting_value<bool>(
                    "ROCPROFSYS_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS")
                    .value_or(false);
            static auto _enable_backtraces =
                config::get_setting_value<bool>("ROCPROFSYS_ROCTRACER_HIP_API_BACKTRACE")
                    .value_or(false);

            constexpr size_t bt_stack_depth       = 16;
            constexpr size_t bt_ignore_depth      = 3;
            constexpr bool   bt_with_signal_frame = true;

            using backtrace_entry_vec_t = std::vector<tim::unwind::processed_entry>;
            auto _bt_data               = std::optional<backtrace_entry_vec_t>{};
            if(_enable_backtraces && config::get_perfetto_annotations())
            {
                auto _backtrace = tim::get_unw_stack<bt_stack_depth, bt_ignore_depth,
                                                     bt_with_signal_frame>();
                _bt_data        = backtrace_entry_vec_t{};
                _bt_data->reserve(_backtrace.size());
                for(auto itr : _backtrace)
                {
                    if(itr)
                    {
                        if(auto _val = binary::lookup_ipaddr_entry<false>(itr->address());
                           _val)
                        {
                            _bt_data->emplace_back(std::move(*_val));
                        }
                    }
                }
            }

            auto _api_id = static_cast<hip_api_id_t>(cid);
            tracing::push_perfetto_ts(
                category::rocm_hip{}, op_name, _ts,
                ::perfetto::Flow::ProcessScoped(_roct_cid),
                [&](::perfetto::EventContext ctx) {
                    if(config::get_perfetto_annotations())
                    {
                        tracing::add_perfetto_annotation(ctx, "begin_ns", _ts);
                        tracing::add_perfetto_annotation(ctx, "cid", _crit_cid);
                        tracing::add_perfetto_annotation(ctx, "pcid", _parent_crit_cid);
                        tracing::add_perfetto_annotation(ctx, "device", _device_id);
                        tracing::add_perfetto_annotation(ctx, "tid", _tid);
                        tracing::add_perfetto_annotation(ctx, "depth", _depth);
                        tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid);
                        if(_compact_annotations)
                        {
                            // all arguments in a single annotation
                            tracing::add_perfetto_annotation(
                                ctx, "args", hip_api_string(_api_id, data));
                        }
                        else
                        {
                            // one annotation per "key=value" argument
                            auto _args = std::string{ hip_api_string(_api_id, data) };
                            if(!_args.empty())
                            {
                                for(auto itr : tim::delimit(_args, ","))
                                {
                                    if(itr.empty()) continue;
                                    // trim surrounding spaces
                                    auto _bpos = itr.find_first_not_of(' ');
                                    auto _epos = itr.find_last_not_of(' ');
                                    if(_epos > _bpos)
                                        itr = itr.substr(_bpos, (_epos - _bpos) + 1);
                                    auto _pos = itr.find('=');
                                    if(_pos != std::string::npos)
                                        tracing::add_perfetto_annotation(
                                            ctx, itr.substr(0, _pos),
                                            itr.substr(_pos + 1));
                                }
                            }
                        }
                        if(_enable_backtraces && _bt_data && !_bt_data->empty())
                        {
                            const std::string _unk    = "??";
                            size_t            _bt_cnt = 0;
                            for(const auto& itr : *_bt_data)
                            {
                                const auto* _func =
                                    (itr.name.empty()) ? &_unk : &itr.name;
                                const auto* _loc =
                                    (itr.location.empty()) ? &_unk : &itr.location;
                                auto _line  = (itr.lineno == 0) ? std::string{ "?" }
                                                                : join("", itr.lineno);
                                auto _entry = join("", demangle(*_func), " @ ",
                                                   join(':', *_loc, _line));
                                if(_bt_cnt < 10)
                                {
                                    // Prepend zero for better ordering in UI.
                                    // Only one zero is ever necessary since stack depth
                                    // is limited to 16.
                                    tracing::add_perfetto_annotation(
                                        ctx, join("", "frame#0", _bt_cnt++), _entry);
                                }
                                else
                                {
                                    tracing::add_perfetto_annotation(
                                        ctx, join("", "frame#", _bt_cnt++), _entry);
                                }
                            }
                        }
                    }
                });
        }

        if(get_use_timemory())
        {
            auto itr = get_roctracer_hip_data()->emplace(
                _roct_cid, roctracer_hip_bundle_t{ op_name });
            if(itr.second)
            {
                itr.first->second.start();
            }
            else if(itr.first != get_roctracer_hip_data()->end())
            {
                // an entry for this correlation id already existed: close it
                itr.first->second.stop();
                get_roctracer_hip_data()->erase(itr.first);
            }
        }

        hip_exec_activity_callbacks(_tid);
    }
    else if(data->phase == ACTIVITY_API_PHASE_EXIT)
    {
        hip_exec_activity_callbacks(_tid);

        if(get_use_perfetto())
        {
            tracing::pop_perfetto_ts(
                category::rocm_hip{}, op_name, _ts, [&](::perfetto::EventContext ctx) {
                    if(config::get_perfetto_annotations())
                    {
                        tracing::add_perfetto_annotation(ctx, "end_ns", _ts);
                    }
                });
        }

        if(get_use_timemory())
        {
            // stops and removes the bundle stored for this correlation id in the
            // data of thread _tid_v; returns whether one was found
            auto _stop = [&_roct_cid](int64_t _tid_v) {
                auto& _data = get_roctracer_hip_data(_tid_v);
                if(!_data) return false;
                auto itr = _data->find(_roct_cid);
                // compare against the end of the SAME container that was searched
                if(itr != _data->end())
                {
                    itr->second.stop();
                    _data->erase(itr);
                    return true;
                }
                return false;
            };

            // try the calling thread first, then every other known thread
            if(!_stop(_tid))
            {
                for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
                {
                    if(_stop(i)) break;
                }
            }
        }
    }
    tim::consume_parameters(arg);
}
// Activity tracing callback
void
hip_activity_callback(const char* begin, const char* end, void* arg)
{
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
return;
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
auto&& _protect = comp::roctracer::protect_flush_activity();
(void) _protect;
if(!trait::runtime_enabled<comp::roctracer>::get()) return;
static auto _kernel_names = std::unordered_map<const char*, std::string>{};
static auto _indexes = std::unordered_map<uint64_t, int>{};
static auto _skip_barrier_packets =
config::get_setting_value<bool>("ROCPROFSYS_ROCTRACER_DISCARD_BARRIERS")
.value_or(false);
const roctracer_record_t* record = reinterpret_cast<const roctracer_record_t*>(begin);
const roctracer_record_t* end_record =
reinterpret_cast<const roctracer_record_t*>(end);
auto&& _advance_record = [&record]() {
ROCPROFSYS_ROCTRACER_CALL(roctracer_next_record(record, &record));
};
while(record < end_record)
{
// make sure every iteration advances regardless of where return point happens
scope::destructor _next_dtor{ _advance_record };
// ROCPROFSYS_CI will enable these asserts and should fail if something relevant
// changes
assert(HIP_OP_ID_DISPATCH == 0);
assert(HIP_OP_ID_COPY == 1);
assert(HIP_OP_ID_BARRIER == 2);
if(record->domain == ACTIVITY_DOMAIN_HSA_OPS)
{
hsa_activity_callback(record->op, record, arg);
continue;
}
if(record->domain != ACTIVITY_DOMAIN_HIP_OPS) continue;
if(record->op > HIP_OP_ID_BARRIER) continue;
if(_skip_barrier_packets && record->op == HIP_OP_ID_BARRIER) continue;
const char* op_name =
roctracer_op_string(record->domain, record->op, record->kind);
auto _ns_skew = get_clock_skew();
uint64_t _beg_ns = record->begin_ns + _ns_skew;
uint64_t _end_ns = record->end_ns + _ns_skew;
auto _roct_cid = record->correlation_id;
auto& _keys = get_roctracer_key_data();
auto& _tids = get_roctracer_tid_data();
int64_t _tid = 0; // thread id
int32_t _devid = record->device_id; // device id
int64_t _queid = record->queue_id; // queue id
uintptr_t _queue = 0; // Host queue (stream)
const char* _name = nullptr;
bool _found = false;
{
locking::atomic_lock _lk{ roctracer_type_mutex<key_data_mutex_t>() };
if(_tids.find(_roct_cid) != _tids.end())
{
_found = true;
_tid = _tids.at(_roct_cid);
auto itr = _keys.find(_roct_cid);
if(itr != _keys.end()) _name = itr->second;
}
}
if(_name == nullptr && op_name == nullptr) continue;
if(_name == nullptr) _name = op_name;
static auto _op_id_names =
std::array<const char*, 3>{ "DISPATCH", "COPY", "BARRIER" };
if(_end_ns < _beg_ns)
{
auto _verbose = []() { return get_verbose() >= 0 || get_debug(); };
static size_t _n = 0;
static size_t _nmax =
get_env<size_t>("ROCPROFSYS_ROCTRACER_DISCARD_INVALID", 0);
if(_nmax == 0) std::swap(_end_ns, _beg_ns);
ROCPROFSYS_WARNING_IF_F(
_n < _nmax && _verbose(),
"%4zu :: Discarding kernel roctracer activity record which ended before "
"it started :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) "
"delta=%li, device=%d, queue=%lu, pid=%u, tid=%lu, op=%s\n",
_n, op_name, _name, record->correlation_id, _beg_ns, _end_ns,
(static_cast<int64_t>(_end_ns) - static_cast<int64_t>(_beg_ns)), _devid,
_queid, record->process_id, _tid, _op_id_names.at(record->op));
ROCPROFSYS_WARNING_IF_F(
_nmax > 0 && _n == _nmax && _verbose(),
"Suppressing future messages about discarding kernel roctracer activity "
"record which ended before it started. Set "
"ROCPROFSYS_ROCTRACER_DISCARD_INVALID=N to increase/decrease the number "
"of messages. If N is set to 0, data will be included after swapping the "
"begin and end values\n");
if(_end_ns < _beg_ns)
{
++_n;
continue;
}
}
// execute this on this thread bc of how perfetto visualization works
if(get_use_perfetto())
{
if(_kernel_names.find(_name) == _kernel_names.end())
_kernel_names.emplace(_name, tim::demangle(_name));
auto _track_desc = [](int32_t _device_id, int64_t _queue_id) {
if(config::get_perfetto_roctracer_per_stream())
return JOIN("", "HIP Activity Device ", _device_id, ", Queue ",
_queue_id);
return JOIN("", "HIP Activity Device ", _device_id);
};
const auto _track = tracing::get_perfetto_track(
category::device_hip{}, _track_desc, _devid,
(get_perfetto_roctracer_per_stream()) ? _queid : 0);
assert(_end_ns >= _beg_ns);
tracing::push_perfetto_track(
category::device_hip{}, _kernel_names.at(_name).c_str(), _track, _beg_ns,
::perfetto::Flow::ProcessScoped(_roct_cid),
[&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns);
tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns);
tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid);
tracing::add_perfetto_annotation(ctx, "device", _devid);
tracing::add_perfetto_annotation(ctx, "queue", _queid);
tracing::add_perfetto_annotation(ctx, "tid", _tid);
tracing::add_perfetto_annotation(
ctx, "stream", JOIN("", "0x", std::hex, _queue));
tracing::add_perfetto_annotation(ctx, "op",
_op_id_names.at(record->op));
}
});
tracing::pop_perfetto_track(category::device_hip{}, "", _track, _end_ns);
}
if(_found && _name != nullptr && get_use_timemory())
{
auto _func = [_beg_ns, _end_ns, _name]() {
roctracer_hip_bundle_t _bundle{ _name };
_bundle.start()
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
.stop()
.get<comp::wall_clock>([&](comp::wall_clock* wc) {
wc->set_value(_end_ns - _beg_ns);
wc->set_accum(_end_ns - _beg_ns);
return wc;
});
_bundle.pop();
};
auto& _async_ops = get_hip_activity_callbacks(_tid);
locking::atomic_lock _lk{ get_hip_activity_mutex(_tid) };
_async_ops->emplace_back(std::move(_func));
}
}
// ensures that all the updates are written
if(get_use_perfetto()) ::perfetto::TrackEvent::Flush();
}
// Accessor for the "roctracer is initialized" flag.
//
// The flag is a function-local static that is seeded on the first call from the
// ROCPROFSYS_ROCTRACER_IS_INIT environment variable (with false passed as the
// fallback value). A mutable reference is returned so callers can update it.
bool&
roctracer_is_init()
{
    static bool _initialized = tim::get_env("ROCPROFSYS_ROCTRACER_IS_INIT", false);
    return _initialized;
}
// Accessor for the "roctracer is set up" flag.
//
// The flag is a function-local static that starts out false; a mutable reference
// is returned so callers can both query and update it.
bool&
roctracer_is_setup()
{
    static bool _is_setup{ false };
    return _is_setup;
}
// List of (name, callable) pairs
using roctracer_functions_t = std::vector<std::pair<std::string, std::function<void()>>>;

// Accessor for the registry of roctracer setup routines.
//
// Returns a mutable reference to a function-local static list which is empty on
// first use; every call returns the same list.
roctracer_functions_t&
roctracer_setup_routines()
{
    static roctracer_functions_t _routines{};
    return _routines;
}
// Accessor for the registry of roctracer shutdown routines.
//
// Returns a mutable reference to a function-local static list which is empty on
// first use; every call returns the same list.
roctracer_functions_t&
roctracer_shutdown_routines()
{
    static roctracer_functions_t _routines{};
    return _routines;
}
} // namespace rocprofsys
-89
ファイルの表示
@@ -1,89 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/hip_runtime.hpp"
#include "core/perfetto.hpp"
#include "library/components/roctracer.hpp"
#include "library/ptl.hpp"
#include <iostream>
#include <memory>
// Macro to check ROC-tracer calls status
//
// Evaluates `call` once, stores its integer status and, when the status is
// non-zero, prints the message from roctracer_error_string() together with the
// text of the call. The text of the call is also passed to ROCPROFSYS_DEBUG_F
// before the call is evaluated.
//
// Notes:
//   - The stringified call is passed as an argument to a "%s" format rather than
//     being used as the format string itself, so a '%' in the expression text
//     cannot be misinterpreted as a conversion specifier.
//   - The status variable has a macro-specific name so that an identifier named
//     `err` used inside `call` refers to the caller's variable.
//   - The expansion is intentionally kept as a plain compound statement so that
//     existing call sites, with or without a trailing semicolon, keep compiling.
#define ROCPROFSYS_ROCTRACER_CALL(call)                                                  \
    {                                                                                    \
        ROCPROFSYS_DEBUG_F("%s", #call);                                                 \
        int _rocprofsys_roctracer_err = (call);                                          \
        if(_rocprofsys_roctracer_err != 0)                                               \
        {                                                                                \
            ROCPROFSYS_PRINT_F("%s in: %s\n", roctracer_error_string(), #call);          \
        }                                                                                \
    }
namespace rocprofsys
{
// ---------------------------------------------------------------------------------- //
// type aliases
// ---------------------------------------------------------------------------------- //

// List of (name, callable) pairs
using roctracer_functions_t = std::vector<std::pair<std::string, std::function<void()>>>;

// Component bundle in the rocm_hsa category
using roctracer_hsa_bundle_t =
    tim::component_bundle<category::rocm_hsa, comp::roctracer_data>;

// Component bundle in the rocm_hip category; additionally carries a wall-clock
using roctracer_hip_bundle_t =
    tim::component_bundle<category::rocm_hip, comp::roctracer_data, comp::wall_clock>;

// ---------------------------------------------------------------------------------- //
// API callbacks
// ---------------------------------------------------------------------------------- //

// HSA API callback function
void
hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg);

// HIP API callback function
void
hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg);

// roctx API callback function (same signature as the HSA/HIP API callbacks)
void
roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg);

// ---------------------------------------------------------------------------------- //
// activity callbacks
// ---------------------------------------------------------------------------------- //

// HSA activity callback for a single record of operation kind `op`
void
hsa_activity_callback(uint32_t op, const void* record, void* arg);

// Activity tracing callback; receives the record buffer as the range [begin, end)
void
hip_activity_callback(const char* begin, const char* end, void* arg);

// NOTE(review): presumably runs the HIP activity callbacks queued for thread
// `_tid` -- confirm against the definition
void
hip_exec_activity_callbacks(int64_t _tid);

// ---------------------------------------------------------------------------------- //
// state accessors
// ---------------------------------------------------------------------------------- //

// Mutable reference to the "roctracer is initialized" flag
bool&
roctracer_is_init();

// Mutable reference to the "roctracer is set up" flag
bool&
roctracer_is_setup();

// NOTE(review): presumably an offset in nanoseconds that is applied to activity
// record timestamps -- confirm against the definition
int64_t
get_clock_skew();

// Mutable reference to the list of named setup routines
roctracer_functions_t&
roctracer_setup_routines();

// Mutable reference to the list of named shutdown routines
roctracer_functions_t&
roctracer_shutdown_routines();
}  // namespace rocprofsys
-1
ファイルの表示
@@ -33,7 +33,6 @@
#include "library/components/mpi_gotcha.hpp"
#include "library/components/numa_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/roctracer.hpp"
#include "library/thread_data.hpp"
#include <timemory/backends/threading.hpp>
+11 -25
ファイルの表示
@@ -4,9 +4,7 @@
#
# -------------------------------------------------------------------------------------- #
set(ROCPROFSYS_ROCM_EVENTS_TEST
"GRBM_COUNT,GPUBusy,SQ_WAVES,SQ_INSTS_VALU,VALUInsts,TCC_HIT_sum,TA_TA_BUSY[0]:device=0,TA_TA_BUSY[11]:device=0"
)
set(ROCPROFSYS_ROCM_EVENTS_TEST "GRBM_COUNT,SQ_WAVES,SQ_INSTS_VALU,TA_TA_BUSY:device=0")
rocprofiler_systems_add_test(
NAME transpose
@@ -26,7 +24,8 @@ rocprofiler_systems_add_test(
args
-E
uniform_int_distribution
ENVIRONMENT "${_base_environment}")
ENVIRONMENT "${_base_environment}"
RUNTIME_TIMEOUT 480)
rocprofiler_systems_add_test(
SKIP_REWRITE SKIP_RUNTIME
@@ -36,9 +35,7 @@ rocprofiler_systems_add_test(
GPU ON
NUM_PROCS 1
RUN_ARGS 1 2 2
ENVIRONMENT
"${_base_environment};ROCPROFSYS_ROCTRACER_HSA_ACTIVITY=OFF;ROCPROFSYS_ROCTRACER_HSA_API=OFF"
)
ENVIRONMENT "${_base_environment}")
rocprofiler_systems_add_test(
SKIP_BASELINE SKIP_RUNTIME
@@ -64,7 +61,11 @@ rocprofiler_systems_add_test(
ENVIRONMENT "${_base_environment}"
REWRITE_FAIL_REGEX "0 instrumented loops in procedure transpose")
if(ROCPROFSYS_USE_ROCPROFILER)
if(ROCPROFSYS_USE_ROCM)
set(_ROCP_PASS_REGEX
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-TA_TA_BUSY.txt(.*)"
)
rocprofiler_systems_add_test(
SKIP_BASELINE SKIP_RUNTIME
NAME transpose-rocprofiler
@@ -76,22 +77,7 @@ if(ROCPROFSYS_USE_ROCPROFILER)
REWRITE_ARGS -e -v 2 -E uniform_int_distribution
ENVIRONMENT
"${_base_environment};ROCPROFSYS_ROCM_EVENTS=${ROCPROFSYS_ROCM_EVENTS_TEST}"
REWRITE_RUN_PASS_REGEX
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt"
)
REWRITE_RUN_PASS_REGEX "${_ROCP_PASS_REGEX}"
SAMPLING_PASS_REGEX "${_ROCP_PASS_REGEX}")
rocprofiler_systems_add_test(
SKIP_BASELINE SKIP_RUNTIME
NAME transpose-rocprofiler-no-roctracer
TARGET transpose
LABELS "rocprofiler"
MPI ${TRANSPOSE_USE_MPI}
GPU ON
NUM_PROCS ${NUM_PROCS}
REWRITE_ARGS -e -v 2 -E uniform_int_distribution
ENVIRONMENT
"${_base_environment};ROCPROFSYS_USE_ROCTRACER=OFF;ROCPROFSYS_ROCM_EVENTS=${ROCPROFSYS_ROCM_EVENTS_TEST}"
REWRITE_RUN_PASS_REGEX
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt"
REWRITE_RUN_FAIL_REGEX "roctracer.txt|ROCPROFSYS_ABORT_FAIL_REGEX")
endif()
+8 -15
ファイルの表示
@@ -226,7 +226,7 @@ endif()
# -------------------------------------------------------------------------------------- #
set(_VALID_GPU OFF)
if(ROCPROFSYS_USE_HIP AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU))
if(ROCPROFSYS_USE_ROCM AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU))
set(_VALID_GPU ON)
find_program(
ROCPROFSYS_ROCM_SMI_EXE
@@ -254,7 +254,7 @@ if(ROCPROFSYS_USE_HIP AND (NOT DEFINED ROCPROFSYS_CI_GPU OR ROCPROFSYS_CI_GPU))
endif()
endif()
set(LULESH_USE_GPU ${LULESH_USE_HIP})
set(LULESH_USE_GPU ${LULESH_USE_ROCM})
if(LULESH_USE_CUDA)
set(LULESH_USE_GPU ON)
endif()
@@ -314,8 +314,6 @@ ROCPROFSYS_SAMPLING_FREQ = 300
ROCPROFSYS_SAMPLING_DELAY = 0.05
ROCPROFSYS_SAMPLING_CPUS = 0-${NUM_SAMPLING_PROCS}
ROCPROFSYS_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES
ROCPROFSYS_ROCTRACER_HSA_API = ON
ROCPROFSYS_ROCTRACER_HSA_ACTIVITY = ON
# test-specific values
${_FILE_CONTENTS}
@@ -430,18 +428,18 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
if(TEST_GPU)
list(APPEND TEST_LABELS "gpu")
if(NOT "ROCPROFSYS_USE_ROCTRACER=OFF" IN_LIST TEST_ENVIRONMENT)
list(APPEND TEST_LABELS "roctracer")
if(NOT "ROCPROFSYS_USE_ROCM=OFF" IN_LIST TEST_ENVIRONMENT)
list(APPEND TEST_LABELS "rocm")
endif()
if(NOT "ROCPROFSYS_USE_ROCM_SMI=OFF" IN_LIST TEST_ENVIRONMENT)
if(NOT "ROCPROFSYS_USE_ROCM=OFF" IN_LIST TEST_ENVIRONMENT)
list(APPEND TEST_LABELS "rocm-smi")
endif()
endif()
if("ROCPROFSYS_USE_ROCTRACER=ON" IN_LIST TEST_ENVIRONMENT AND NOT "roctracer" IN_LIST
TEST_ENVIRONMENT)
list(APPEND TEST_LABELS "roctracer")
if("ROCPROFSYS_USE_ROCM=ON" IN_LIST TEST_ENVIRONMENT AND NOT "rocm" IN_LIST
TEST_ENVIRONMENT)
list(APPEND TEST_LABELS "rocm")
endif()
if("ROCPROFSYS_USE_ROCM_SMI=ON" IN_LIST TEST_ENVIRONMENT AND NOT "rocm-smi" IN_LIST
@@ -449,11 +447,6 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
list(APPEND TEST_LABELS "rocm-smi")
endif()
if("ROCPROFSYS_USE_ROCPROFILER=ON" IN_LIST TEST_ENVIRONMENT
AND NOT "rocprofiler" IN_LIST TEST_ENVIRONMENT)
list(APPEND TEST_LABELS "rocprofiler")
endif()
if(TARGET ${TEST_TARGET})
if(DEFINED TEST_MPI
AND ${TEST_MPI}