RCCL support (#93)

* Initial support for RCCL

* OMNITRACE_USE_RCCLP + sampling tweaks

- also OMNITRACE_SAMPLING_KEEP_INTERNAL option
- minor modifications to sampling to use keep internal option + discard funlockfile

* Update docker and workflows to download RCCL

* Update CPack DEB with rocprofiler dependency

* Rework rccl into library and library/components folder

- add tpls/rccl/rccl/rccl.h

* Fix timemory includes

* rcclp inline definitions when disabled

* Tweaks to ubuntu-focal-external-rocm

- disable ompt
- enable building testing

* Tweaks to ubuntu-focal-external-rocm

- ctest exclude

* Tweak ubuntu-focal.yml

- remove source /.../setup-env.sh, replace with $GITHUB_ENV

* Fix ubuntu-focal-rocm + OMPI + root

* Improved rocm-smi error handling

- Recover from rocm-smi errors
- Disabling rocm-smi after recovering from errors
- Werror in developer mode
- Remove State::DelayedInit
- Add State::Disabled

* formatting

* Fix merge of OMNITRACE_SAMPLING_KEEP_INTERNAL

* Update RCCL include directory

- based on ROCm version we need with <rccl/rccl.h> or <rccl.h>

* RCCL Testing

- updated tests to use configuration files
- many tests generate a configuration file
- tests how have GPU option
- enable ncclCommCount, disable ncclGetVersion
- add testing for RCCLP via rccl-tests
- working directory of tests is PROJECT_BINARY_DIR
- add nccl/rccl functions to get_whole_function_names
- some clang compiler fixes

* Handle RCCL include w/o HIP

* RCCL requires HIP

* Update OMNITRACE_SAMPLING_CPUS for testing

* Update tests/CMakeLists.txt

* Debug settings

* Install MPI even when USE_MPI=OFF

* exclude printf

* skip mpi tests w/o USE_MPI or USE_MPI_HEADERS

* update ubuntu rocm workflow

* Fix configure env step for ubuntu rocm

[ROCm/rocprofiler-systems commit: 45be03906a]
这个提交包含在:
Jonathan R. Madsen
2022-07-25 12:16:11 -05:00
提交者 GitHub
父节点 8228dcef50
当前提交 93343034bc
修改 29 个文件,包含 1813 行新增120 行删除
@@ -28,6 +28,7 @@ parse:
NAME: '*'
TARGET: '*'
MPI: '*'
GPU: '*'
NUM_PROCS: '*'
REWRITE_TIMEOUT: '*'
RUNTIME_TIMEOUT: '*'
+1
查看文件
@@ -121,5 +121,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
+1
查看文件
@@ -150,5 +150,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
+97 -22
查看文件
@@ -66,7 +66,7 @@ jobs:
add-apt-repository -y ppa:ubuntu-toolchain-r/test &&
apt-get update &&
apt-get upgrade -y &&
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev ${{ matrix.compiler }} &&
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev libmpich-dev mpich ${{ matrix.compiler }} &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy &&
python3 -m pip install perfetto &&
@@ -166,6 +166,7 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
@@ -176,12 +177,23 @@ jobs:
strategy:
matrix:
compiler: ['g++']
rocm_version: ['4.3', '4.5', 'debian']
rocm_version: ['4.3', '4.5', '5.0']
mpi_headers: ['OFF']
build_jobs: ['4']
ctest_exclude: ['-LE "mpi-example|transpose"']
perfetto-tools: ['ON']
include:
- compiler: 'g++'
rocm_version: 'debian'
mpi_headers: 'ON'
build_jobs: '2'
ctest_exclude: '-LE transpose'
perfetto-tools: 'OFF'
env:
BUILD_TYPE: MinSizeRel
OMNITRACE_OUTPUT_PATH: omnitrace-tests-output
OMNITRACE_OUTPUT_PREFIX: "%argt%/"
OMPI_ALLOW_RUN_AS_ROOT: 1
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
steps:
- uses: actions/checkout@v2
@@ -194,18 +206,41 @@ jobs:
wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&
echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list &&
apt-get update &&
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl &&
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev curl libopenmpi-dev openmpi-bin libfabric-dev &&
python3 -m pip install --upgrade pip &&
python3 -m pip install 'cmake==3.16.3' &&
for i in 6 7 8 9; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
python3 -m pip install 'cmake==3.21.4' &&
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
- name: Install RCCL
if: ${{ matrix.rocm_version != '4.3' }}
timeout-minutes: 5
run:
apt-get install -y rccl-dev
- name: Configure Env
run:
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV &&
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV &&
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV &&
echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH &&
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
run: |
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
cat << EOF > test-install.cfg
OMNITRACE_USE_TIMEMORY = ON
OMNITRACE_USE_PERFETTO = ON
OMNITRACE_USE_PID = OFF
OMNITRACE_USE_SAMPLING = OFF
OMNITRACE_USE_PROCESS_SAMPLING = OFF
OMNITRACE_COUT_OUTPUT = ON
OMNITRACE_TIME_OUTPUT = OFF
OMNITRACE_TIMEMORY_COMPONENTS = cpu_clock cpu_util current_peak_rss kernel_mode_time monotonic_clock monotonic_raw_clock network_stats num_io_in num_io_out num_major_page_faults num_minor_page_faults page_rss peak_rss priority_context_switch process_cpu_clock process_cpu_util read_bytes read_char system_clock thread_cpu_clock thread_cpu_util timestamp trip_count user_clock user_mode_time virtual_memory voluntary_context_switch wall_clock written_bytes written_char
OMNITRACE_OUTPUT_PATH = omnitrace-tests-output
OMNITRACE_OUTPUT_PREFIX = %tag%/
OMNITRACE_DEBUG = OFF
OMNITRACE_VERBOSE = 3
OMNITRACE_DL_VERBOSE = 3
OMNITRACE_PERFETTO_BACKEND = system
EOF
realpath test-install.cfg
cat test-install.cfg
- name: Configure CMake
timeout-minutes: 10
@@ -217,22 +252,27 @@ jobs:
-DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
-DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }}
-DCMAKE_INSTALL_PREFIX=/opt/omnitrace
-DOMNITRACE_BUILD_TESTING=OFF
-DOMNITRACE_BUILD_TESTING=ON
-DOMNITRACE_BUILD_DEVELOPER=ON
-DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF
-DOMNITRACE_BUILD_LTO=OFF
-DOMNITRACE_USE_MPI=OFF
-DOMNITRACE_USE_MPI_HEADERS=ON
-DOMNITRACE_USE_HIP=ON
-DOMNITRACE_MAX_THREADS=32
-DOMNITRACE_USE_SANITIZER=OFF
-DOMNITRACE_USE_PAPI=OFF
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON
-DOMNITRACE_USE_OMPT=OFF
-DOMNITRACE_USE_PYTHON=ON
-DOMNITRACE_USE_MPI_HEADERS=${{ matrix.mpi_headers }}
-DOMNITRACE_USE_SANITIZER=OFF
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=${{ matrix.perfetto-tools }}
-DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs
-DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10"
-DOMNITRACE_CI_MPI_RUN_AS_ROOT=${{ matrix.mpi_headers }}
- name: Build
timeout-minutes: 60
run:
cmake --build build --target all --parallel 2 -- VERBOSE=1
cmake --build build --target all --parallel ${{ matrix.build_jobs }} -- VERBOSE=1
- name: Install
run:
@@ -244,20 +284,28 @@ jobs:
cd build &&
ldd ./bin/omnitrace &&
./bin/omnitrace --help &&
ctest -V -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
ctest -V --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure
ctest -V ${{ matrix.ctest_exclude }} -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure
- name: Configure Install Env
run: |
echo "/opt/omnitrace/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
echo "OMNITRACE_CONFIG_FILE=test-install.cfg" >> $GITHUB_ENV
- name: Test Install
timeout-minutes: 10
if: ${{ matrix.perfetto-tools == 'ON' }}
run: |
set -v
cat ${OMNITRACE_CONFIG_FILE}
omnitrace-perfetto-traced --background
export OMNITRACE_DEBUG=ON
export OMNITRACE_PERFETTO_BACKEND=system
which omnitrace-avail
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-python
omnitrace-python --help
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
@@ -272,6 +320,31 @@ jobs:
du -m ls-perfetto-trace.proto
/opt/conda/envs/py3.8/bin/python ./tests/validate-perfetto-proto.py -p -i ./ls-perfetto-trace.proto
- name: Test Install
timeout-minutes: 10
if: ${{ matrix.perfetto-tools == 'OFF' }}
run: |
set -v
cat ${OMNITRACE_CONFIG_FILE}
which omnitrace-avail
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-python
omnitrace-python --help
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
ldd $(which omnitrace)
omnitrace --help
omnitrace -e -v 1 -o sleep.inst --simulate -- sleep
omnitrace -e -v 1 --simulate -- sleep
omnitrace -e -v 1 -o sleep.inst -- sleep
./sleep.inst 5
omnitrace -e -v 1 -- sleep 5
cat omnitrace-tests-output/sleep.inst/wall_clock.txt
cat omnitrace-tests-output/sleep/wall_clock.txt
- name: Test User API
timeout-minutes: 10
run: |
@@ -293,6 +366,7 @@ jobs:
name: data-${{ github.job }}-files
path: |
omnitrace-tests-output/**/*.txt
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
@@ -445,5 +519,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
${{ github.workspace }}/build/omnitrace-tests-config/*.cfg
${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt
${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json
+7 -2
查看文件
@@ -126,6 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support"
omnitrace_add_option(
OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling"
${OMNITRACE_USE_HIP})
omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP})
omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON)
@@ -175,14 +176,18 @@ if(NOT OMNITRACE_USE_HIP)
set(OMNITRACE_USE_ROCM_SMI
OFF
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
set(OMNITRACE_USE_RCCL
OFF
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
elseif(
OMNITRACE_USE_HIP
AND NOT OMNITRACE_USE_ROCTRACER
AND NOT OMNITRACE_USE_ROCPROFILER
AND NOT OMNITRACE_USE_ROCM_SMI)
AND NOT OMNITRACE_USE_ROCM_SMI
AND NOT OMNITRACE_USE_RCCL)
omnitrace_message(
AUTHOR_WARNING
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, and rocm-smi options are disabled"
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled"
)
set(OMNITRACE_USE_HIP OFF)
endif()
@@ -157,6 +157,7 @@ if(NOT OMNITRACE_BUILD_DYNINST)
endif()
endif()
if(ROCmVersion_FOUND)
set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCM_SMI_SUFFIX
" (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})")
@@ -167,6 +168,9 @@ endif()
if(OMNITRACE_USE_ROCTRACER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}")
endif()
if(OMNITRACE_USE_ROCPROFILER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}")
endif()
if(OMNITRACE_USE_MPI)
if("${OMNITRACE_MPI_IMPL}" STREQUAL "openmpi")
list(APPEND _DEBIAN_PACKAGE_DEPENDS "libopenmpi-dev")
@@ -0,0 +1,94 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file
# Copyright.txt or https://cmake.org/licensing for details.
include(FindPackageHandleStandardArgs)
# ----------------------------------------------------------------------------------------#
set(RCCL-Headers_INCLUDE_DIR_INTERNAL
"${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/rccl"
CACHE PATH "Path to internal rccl.h")
# ----------------------------------------------------------------------------------------#
if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
set(ROCM_PATH "$ENV{ROCM_PATH}")
endif()
foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl)
if(EXISTS ${_DIR})
get_filename_component(_ABS_DIR "${_DIR}" REALPATH)
list(APPEND _RCCL_PATHS ${_ABS_DIR})
endif()
endforeach()
# ----------------------------------------------------------------------------------------#
find_package(
rccl
QUIET
CONFIG
HINTS
${_RCCL_PATHS}
PATHS
${_RCCL_PATHS}
PATH_SUFFIXES
rccl/lib/cmake)
if(NOT rccl_FOUND)
set(RCCL-Headers_INCLUDE_DIR
"${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
CACHE PATH "Path to RCCL headers")
else()
set(RCCL-Headers_INCLUDE_DIR
"${rccl_INCLUDE_DIR}"
CACHE PATH "Path to RCCL headers")
endif()
# because of the annoying warning starting with v5.2.0, we've got to do this crap
if(ROCmVersion_NUMERIC_VERSION)
if(ROCmVersion_NUMERIC_VERSION LESS 50200)
set(_RCCL-Headers_FILE "rccl.h")
set(_RCCL-Headers_DIR "/rccl")
else()
set(_RCCL-Headers_FILE "rccl/rccl.h")
set(_RCCL-Headers_DIR "")
endif()
else()
set(_RCCL-Headers_FILE "rccl/rccl.h")
set(_RCCL-Headers_DIR "")
endif()
if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}")
omnitrace_message(
AUTHOR_WARNING
"RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
)
set(RCCL-Headers_INCLUDE_DIR
"${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}"
CACHE PATH "Path to RCCL headers" FORCE)
endif()
unset(_RCCL-Headers_FILE)
unset(_RCCL-Headers_DIR)
mark_as_advanced(RCCL-Headers_INCLUDE_DIR)
# ----------------------------------------------------------------------------------------#
find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR)
# ------------------------------------------------------------------------------#
if(RCCL-Headers_FOUND)
add_library(roc::rccl-headers INTERFACE IMPORTED)
set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR})
target_include_directories(roc::rccl-headers SYSTEM
INTERFACE ${RCCL-Headers_INCLUDE_DIR})
add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED)
target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers)
endif()
# ------------------------------------------------------------------------------#
@@ -20,6 +20,8 @@ omnitrace_add_interface_library(omnitrace-rocprofiler
"Provides flags and libraries for rocprofiler")
omnitrace_add_interface_library(omnitrace-rocm-smi
"Provides flags and libraries for rocm-smi")
omnitrace_add_interface_library(
omnitrace-rccl "Provides flags for ROCm Communication Collectives Library (RCCL)")
omnitrace_add_interface_library(omnitrace-mpi "Provides MPI or MPI headers")
omnitrace_add_interface_library(omnitrace-ptl "Enables PTL support (tasking)")
omnitrace_add_interface_library(omnitrace-papi "Enable PAPI support")
@@ -37,6 +39,7 @@ set(OMNITRACE_EXTENSION_LIBRARIES
omnitrace::omnitrace-roctracer
omnitrace::omnitrace-rocprofiler
omnitrace::omnitrace-rocm-smi
omnitrace::omnitrace-rccl
omnitrace::omnitrace-mpi
omnitrace::omnitrace-ptl
omnitrace::omnitrace-ompt
@@ -196,6 +199,17 @@ if(OMNITRACE_USE_ROCM_SMI)
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${rocm-smi_LIBRARY_DIRS}")
endif()
# ----------------------------------------------------------------------------------------#
#
# RCCL
#
# ----------------------------------------------------------------------------------------#
if(OMNITRACE_USE_RCCL)
find_package(RCCL-Headers ${omnitrace_FIND_QUIETLY} REQUIRED)
target_link_libraries(omnitrace-rccl INTERFACE roc::rccl-headers)
omnitrace_target_compile_definitions(omnitrace-rccl INTERFACE OMNITRACE_USE_RCCL)
endif()
# ----------------------------------------------------------------------------------------#
#
# MPI
@@ -24,7 +24,7 @@ ARG AMDGPU_RPM=21.40.2/rhel/7.9/amdgpu-install-21.40.2.40502-1.el7.noarch.rpm
RUN yum install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms --skip-broken -y && \
yum install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
yum install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \
yum update -y && \
yum clean all
@@ -25,7 +25,7 @@ RUN zypper --no-gpg-checks install -y https://repo.radeon.com/amdgpu-install/${A
zypper addrepo https://download.opensuse.org/repositories/devel:languages:perl/SLE_15/devel:languages:perl.repo && \
zypper --non-interactive --gpg-auto-import-keys refresh && \
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms -y && \
zypper install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
zypper install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \
zypper clean --all
ARG PYTHON_VERSIONS="6 7 8 9 10"
@@ -28,7 +28,7 @@ RUN apt-get update && \
echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_REPO_VERSION}/ ${ROCM_REPO_DIST} main" | tee /etc/apt/sources.list.d/rocm.list && \
apt-get update && \
apt-get dist-upgrade -y && \
apt-get install -y rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \
apt-get install -y rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \
apt-get autoclean
RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
@@ -32,3 +32,4 @@ add_subdirectory(openmp)
add_subdirectory(mpi)
add_subdirectory(python)
add_subdirectory(lulesh)
add_subdirectory(rccl)
@@ -0,0 +1,61 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
project(omnitrace-rccl-example LANGUAGES CXX)
find_package(rccl)
find_package(hip HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR})
function(rccl_message)
if("${CMAKE_PROJECT_NAME}" STREQUAL "omnitrace")
omnitrace_message(${ARGN})
else()
message(${ARGN})
endif()
endfunction()
if(hip_FOUND AND rccl_FOUND)
include(FetchContent)
fetchcontent_declare(
rccl-tests GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rccl-tests.git)
# After the following call, the CMake targets defined by googletest and Catch2 will be
# available to the rest of the build
fetchcontent_makeavailable(rccl-tests)
get_filename_component(rccl_ROOT_DIR "${rccl_INCLUDE_DIR}" DIRECTORY)
rccl_message(STATUS "Building rccl-tests...")
execute_process(
COMMAND make HIP_HOME=${ROCM_PATH} RCCL_HOME=${rccl_ROOT_DIR}
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/rccl-tests-src
RESULT_VARIABLE _RCCL_BUILD_RET
ERROR_VARIABLE _RCCL_BUILD_ERR
OUTPUT_VARIABLE _RCCL_BUILD_OUT
OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE)
if(NOT _RCCL_BUILD_RET EQUAL 0)
rccl_message(${_RCCL_BUILD_OUT})
rccl_message(AUTHOR_WARNING "Failed to build rccl-tests: ${_RCCL_BUILD_ERR}")
else()
file(GLOB RCCL_TEST_EXECUTABLES
${CMAKE_BINARY_DIR}/_deps/rccl-tests-src/build/*_perf)
set(_RCCL_TEST_TARGETS)
foreach(_EXE ${RCCL_TEST_EXECUTABLES})
get_filename_component(_EXE_NAME "${_EXE}" NAME)
execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${_EXE}
${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME})
add_executable(rccl-tests::${_EXE_NAME} IMPORTED GLOBAL)
set_property(
TARGET rccl-tests::${_EXE_NAME}
PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME})
list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}")
endforeach()
set(RCCL_TEST_TARGETS
"${_RCCL_TEST_TARGETS}"
CACHE INTERNAL "rccl-test targets")
endif()
else()
rccl_message(AUTHOR_WARNING "${PROJECT_NAME} skipped. Missing RCCL and/or HIP...")
endif()
@@ -40,13 +40,16 @@ endif()
add_executable(transpose transpose.cpp)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang"
AND NOT CMAKE_CXX_COMPILER_IS_HIPCC
AND NOT HIPCC_EXECUTABLE)
target_link_libraries(
transpose
PRIVATE
$<IF:$<TARGET_EXISTS:omnitrace::omnitrace-compile-options>,omnitrace::omnitrace-compile-options,>
$<IF:$<TARGET_EXISTS:hip::host>,hip::host,>
$<IF:$<TARGET_EXISTS:hip::device>,hip::device,>)
PRIVATE $<TARGET_NAME_IF_EXISTS:omnitrace::omnitrace-compile-options>
$<TARGET_NAME_IF_EXISTS:hip::host> $<TARGET_NAME_IF_EXISTS:hip::device>)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
target_link_libraries(
transpose PRIVATE $<TARGET_NAME_IF_EXISTS:omnitrace::omnitrace-compile-options>)
else()
target_compile_options(transpose PRIVATE -W -Wall)
endif()
@@ -52,7 +52,13 @@ get_whole_function_names()
"rocr::core::BusyWaitSignal::WaitAcquire",
"rocr::core::BusyWaitSignal::WaitRelaxed", "rocr::HSA::hsa_signal_wait_scacquire",
"rocr::os::ThreadTrampoline", "rocr::image::ImageRuntime::CreateImageManager",
"rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info", "event_base_loop"
"rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info",
"event_base_loop", "bootstrapRoot", "bootstrapNetAccept", "ncclCommInitRank",
"ncclCommInitAll", "ncclCommDestroy", "ncclCommCount", "ncclCommCuDevice",
"ncclCommUserRank", "ncclReduce", "ncclBcast", "ncclBroadcast", "ncclAllReduce",
"ncclReduceScatter", "ncclAllGather", "ncclGroupStart", "ncclGroupEnd",
"ncclSend", "ncclRecv", "ncclGather", "ncclScatter", "ncclAllToAll",
"ncclAllToAllv"
};
#else
// should hopefully be removed soon
@@ -429,6 +429,7 @@ module_function::is_routine_constrained() const
static std::regex exclude(
"(omnitrace|tim::|N3tim|MPI_Init|MPI_Finalize|dyninst|tm_clones)", regex_opts);
static std::regex exclude_printf("(|v|f)printf$", regex_opts);
static std::regex exclude_cxx(
"(std::_Sp_counted_base|std::(use|has)_facet|std::locale|::sentry|^std::_|::_(M|"
"S)_|::basic_string[a-zA-Z,<>: ]+::_M_create|::__|::_(Alloc|State)|"
@@ -456,6 +457,11 @@ module_function::is_routine_constrained() const
return _report("Excluding", "critical", 3);
}
if(std::regex_search(function_name, exclude_printf))
{
return _report("Excluding", "critical-printf", 3);
}
if(whole.count(function_name) > 0)
{
return _report("Excluding", "critical-whole-match", 3);
@@ -34,6 +34,7 @@ target_link_libraries(
$<BUILD_INTERFACE:omnitrace::omnitrace-roctracer>
$<BUILD_INTERFACE:omnitrace::omnitrace-rocprofiler>
$<BUILD_INTERFACE:omnitrace::omnitrace-rocm-smi>
$<BUILD_INTERFACE:omnitrace::omnitrace-rccl>
$<BUILD_INTERFACE:$<IF:$<BOOL:${OMNITRACE_BUILD_LTO}>,omnitrace::omnitrace-lto,>>
$<BUILD_INTERFACE:$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBGCC}>,omnitrace::omnitrace-static-libgcc,>>
$<BUILD_INTERFACE:$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBSTDCXX}>,omnitrace::omnitrace-static-libstdcxx,>>
@@ -100,6 +101,7 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp
${CMAKE_CURRENT_LIST_DIR}/library/process_sampler.hpp
${CMAKE_CURRENT_LIST_DIR}/library/ptl.hpp
${CMAKE_CURRENT_LIST_DIR}/library/rcclp.hpp
${CMAKE_CURRENT_LIST_DIR}/library/rocm.hpp
${CMAKE_CURRENT_LIST_DIR}/library/rocprofiler.hpp
${CMAKE_CURRENT_LIST_DIR}/library/roctracer.hpp
@@ -118,6 +120,7 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/library/components/functors.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/omnitrace.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/rocm_smi.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/rocprofiler.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer.hpp
@@ -143,6 +146,13 @@ if(OMNITRACE_USE_ROCTRACER)
${CMAKE_CURRENT_LIST_DIR}/library/roctracer.cpp)
endif()
if(OMNITRACE_USE_RCCL)
target_sources(
omnitrace-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.cpp
${CMAKE_CURRENT_LIST_DIR}/library/rcclp.cpp)
endif()
if(OMNITRACE_USE_ROCPROFILER)
target_sources(
omnitrace-object-library
@@ -41,6 +41,7 @@
#include "library/ompt.hpp"
#include "library/process_sampler.hpp"
#include "library/ptl.hpp"
#include "library/rcclp.hpp"
#include "library/rocprofiler.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
@@ -647,6 +648,12 @@ omnitrace_init_tooling_hidden()
ompt::setup();
}
if(get_use_rcclp())
{
OMNITRACE_VERBOSE_F(1, "Setting up RCCLP...\n");
rcclp::setup();
}
if(get_use_perfetto() && !is_system_backend())
{
#if defined(CUSTOM_DATA_SOURCE)
@@ -840,6 +847,12 @@ omnitrace_finalize_hidden(void)
}
}
if(get_use_rcclp())
{
OMNITRACE_VERBOSE_F(1, "Shutting down RCCLP...\n");
rcclp::shutdown();
}
if(get_use_ompt())
{
OMNITRACE_VERBOSE_F(1, "Shutting down OMPT...\n");
@@ -47,6 +47,7 @@
TIMEMORY_DEFINE_NS_API(api, omnitrace)
TIMEMORY_DEFINE_NS_API(api, sampling)
TIMEMORY_DEFINE_NS_API(api, rocm_smi)
TIMEMORY_DEFINE_NS_API(api, rccl)
namespace omnitrace
{
@@ -22,6 +22,7 @@
#pragma once
#include "library/common.hpp"
#include "library/defines.hpp"
#include <timemory/api.hpp>
@@ -32,6 +33,7 @@
#include <timemory/enum.h>
#include <timemory/mpl/concepts.hpp>
#include <timemory/mpl/type_traits.hpp>
#include <timemory/mpl/types.hpp>
#include <type_traits>
@@ -40,6 +42,10 @@ TIMEMORY_DEFINE_NS_API(category, process_sampling)
TIMEMORY_DECLARE_COMPONENT(roctracer)
TIMEMORY_DECLARE_COMPONENT(rocprofiler)
TIMEMORY_DECLARE_COMPONENT(rccl_comm_data)
TIMEMORY_DECLARE_COMPONENT(rcclp_handle)
TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl)
TIMEMORY_COMPONENT_ALIAS(rccl_data_tracker_t, data_tracker<float, rccl_api_t>)
/// \struct tim::trait::name
/// \brief provides a constexpr string in ::value
@@ -74,6 +80,7 @@ TIMEMORY_DEFINE_NS_API(category, pthread)
TIMEMORY_DEFINE_NS_API(category, kokkos)
TIMEMORY_DEFINE_NS_API(category, mpi)
TIMEMORY_DEFINE_NS_API(category, ompt)
TIMEMORY_DEFINE_NS_API(category, rccl)
TIMEMORY_DEFINE_NS_API(category, critical_trace)
TIMEMORY_DEFINE_NS_API(category, host_critical_trace)
TIMEMORY_DEFINE_NS_API(category, device_critical_trace)
@@ -93,6 +100,7 @@ TIMEMORY_DEFINE_NAME_TRAIT("pthread", category::pthread);
TIMEMORY_DEFINE_NAME_TRAIT("kokkos", category::kokkos);
TIMEMORY_DEFINE_NAME_TRAIT("mpi", category::mpi);
TIMEMORY_DEFINE_NAME_TRAIT("ompt", category::ompt);
TIMEMORY_DEFINE_NAME_TRAIT("rccl", category::rccl);
TIMEMORY_DEFINE_NAME_TRAIT("critical-trace", category::critical_trace);
TIMEMORY_DEFINE_NAME_TRAIT("host-critical-trace", category::host_critical_trace);
TIMEMORY_DEFINE_NAME_TRAIT("device-critical-trace", category::device_critical_trace);
@@ -150,6 +158,13 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type)
#endif
#if !defined(OMNITRACE_USE_RCCL)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_comm_data, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_data_tracker_t, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type)
#endif
#if !defined(TIMEMORY_USE_LIBUNWIND)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type)
@@ -0,0 +1,269 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/rcclp.hpp"
#include "library/rcclp.hpp"
#include <timemory/manager.hpp>
std::ostream&
operator<<(std::ostream& _os, const ncclUniqueId& _v)
{
for(auto itr : _v.internal)
_os << itr;
return _os;
}
namespace tim
{
namespace component
{
uint64_t
activate_rcclp()
{
using handle_t = tim::component::rcclp_handle;
static auto _handle = std::shared_ptr<handle_t>{};
if(!_handle.get())
{
_handle = std::make_shared<handle_t>();
_handle->start();
auto cleanup_functor = [=]() {
if(_handle)
{
_handle->stop();
_handle.reset();
}
};
std::stringstream ss;
ss << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
<< demangle<api::rccl>();
tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor);
return 1;
}
return 0;
}
//
//======================================================================================//
//
uint64_t
deactivate_rcclp(uint64_t id)
{
if(id > 0)
{
std::stringstream ss;
ss << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
<< demangle<api::rccl>();
tim::manager::instance()->cleanup(ss.str());
return 0;
}
return 1;
}
//
//======================================================================================//
//
void
configure_rcclp(const std::set<std::string>& permit, const std::set<std::string>& reject)
{
static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS;
using rcclp_gotcha_t =
tim::component::gotcha<rcclp_wrapper_count, rccl_toolset_t, api::rccl>;
static bool is_initialized = false;
if(!is_initialized)
{
// generate the gotcha wrappers
rcclp_gotcha_t::get_initializer() = []() {
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion);
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll);
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv);
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum);
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy);
};
// provide environment variable for suppressing wrappers
rcclp_gotcha_t::get_reject_list() = [reject]() {
auto _reject = reject;
// check environment
auto reject_list =
tim::get_env<std::string>("OMNITRACE_RCCLP_REJECT_LIST", "");
// add environment setting
for(const auto& itr : tim::delimit(reject_list))
_reject.insert(itr);
return _reject;
};
// provide environment variable for selecting wrappers
rcclp_gotcha_t::get_permit_list() = [permit]() {
auto _permit = permit;
// check environment
auto permit_list =
tim::get_env<std::string>("OMNITRACE_RCCLP_PERMIT_LIST", "");
// add environment setting
for(const auto& itr : tim::delimit(permit_list))
_permit.insert(itr);
return _permit;
};
is_initialized = true;
}
}
void
rcclp_handle::start()
{
if(get_tool_count()++ == 0)
{
get_tool_instance() = std::make_shared<rcclp_tuple_t>("timemory_rcclp");
get_tool_instance()->start();
}
}
void
rcclp_handle::stop()
{
auto idx = --get_tool_count();
if(get_tool_instance().get())
{
get_tool_instance()->stop();
if(idx == 0) get_tool_instance().reset();
}
}
rcclp_handle::persistent_data&
rcclp_handle::get_persistent_data()
{
static persistent_data _instance;
return _instance;
}
std::atomic<short>&
rcclp_handle::get_configured()
{
return get_persistent_data().m_configured;
}
rcclp_handle::toolset_ptr_t&
rcclp_handle::get_tool_instance()
{
return get_persistent_data().m_tool;
}
std::atomic<int64_t>&
rcclp_handle::get_tool_count()
{
return get_persistent_data().m_count;
}
void
rccl_comm_data::preinit()
{
omnitrace::rcclp::configure();
}
// ncclReduce
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
ncclComm_t, hipStream_t)
{
int size = rccl_type_size(datatype);
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
}
// ncclSend
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*,
size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
hipStream_t)
{
int size = rccl_type_size(datatype);
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", peer));
}
// ncclBcast
// ncclRecv
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
ncclDataType_t datatype, int root, ncclComm_t, hipStream_t)
{
int size = rccl_type_size(datatype);
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
}
// ncclBroadcast
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, int root, ncclComm_t,
hipStream_t)
{
int size = rccl_type_size(datatype);
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
}
// ncclAllReduce
// ncclReduceScatter
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
hipStream_t)
{
int size = rccl_type_size(datatype);
add(_data, count * size);
}
// ncclAllGather
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
{
int size = rccl_type_size(datatype);
add(_data, count * size);
}
} // namespace component
} // namespace tim
TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t)
@@ -0,0 +1,220 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/common.hpp"
#include "library/components/category_region.hpp"
#include "library/components/fwd.hpp"
#include "library/defines.hpp"
#include "library/timemory.hpp"
#include <timemory/api/macros.hpp>
#include <timemory/components/macros.hpp>
#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200
# include <rccl/rccl.h>
#else
# include <rccl.h>
#endif
#include <atomic>
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <utility>
#if !defined(OMNITRACE_NUM_RCCLP_WRAPPERS)
# define OMNITRACE_NUM_RCCLP_WRAPPERS 25
#endif
TIMEMORY_COMPONENT_ALIAS(
rccl_toolset_t,
component_bundle<rccl_api_t, omnitrace::component::category_region<category::rccl>,
rccl_comm_data*>)
TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t,
gotcha<OMNITRACE_NUM_RCCLP_WRAPPERS, rccl_toolset_t, rccl_api_t>)
#if !defined(OMNITRACE_USE_RCCL)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type)
#endif
TIMEMORY_STATISTICS_TYPE(component::rccl_data_tracker_t, float)
TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::rccl_data_tracker_t,
true_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, component::rccl_data_tracker_t,
true_type)
namespace tim
{
namespace component
{
uint64_t
activate_rcclp();
uint64_t
deactivate_rcclp(uint64_t id);
void
configure_rcclp(const std::set<std::string>& permit = {},
const std::set<std::string>& reject = {});
struct rcclp_handle : base<rcclp_handle, void>
{
static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS;
using value_type = void;
using this_type = rcclp_handle;
using base_type = base<this_type, value_type>;
using rcclp_tuple_t = tim::component_tuple<rcclp_gotcha_t>;
using toolset_ptr_t = std::shared_ptr<rcclp_tuple_t>;
static std::string label() { return "rcclp_handle"; }
static std::string description() { return "Handle for activating NCCL wrappers"; }
static void get() {}
static void start();
static void stop();
static int get_count() { return get_tool_count().load(); }
private:
struct persistent_data
{
std::atomic<short> m_configured{ 0 };
std::atomic<int64_t> m_count{ 0 };
toolset_ptr_t m_tool = toolset_ptr_t{};
};
static persistent_data& get_persistent_data();
static std::atomic<short>& get_configured();
static toolset_ptr_t& get_tool_instance();
static std::atomic<int64_t>& get_tool_count();
};
struct rccl_comm_data : base<rccl_comm_data, void>
{
using value_type = void;
using this_type = rccl_comm_data;
using base_type = base<this_type, value_type>;
using tracker_t = tim::auto_tuple<rccl_data_tracker_t>;
using data_type = float;
TIMEMORY_DEFAULT_OBJECT(rccl_comm_data)
static void preinit();
static void start() {}
static void stop() {}
static auto rccl_type_size(ncclDataType_t datatype)
{
switch(datatype)
{
case ncclInt8:
case ncclUint8: return 1;
case ncclFloat16: return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32: return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64: return 8;
default: return 0;
};
}
// ncclReduce
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
ncclComm_t, hipStream_t);
// ncclSend
static void audit(const gotcha_data& _data, audit::incoming, const void*,
size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
hipStream_t);
// ncclBcast
// ncclRecv
static void audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
ncclDataType_t datatype, int root, ncclComm_t, hipStream_t);
// ncclBroadcast
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, int root, ncclComm_t,
hipStream_t);
// ncclAllReduce
// ncclReduceScatter
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
hipStream_t);
// ncclAllGather
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);
private:
template <typename... Args>
static void add(tracker_t& _t, data_type value, Args&&... args)
{
_t.store(std::plus<data_type>{}, value);
TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward<Args>(args), value));
}
template <typename... Args>
static void add(const gotcha_data& _data, data_type value, Args&&... args)
{
tracker_t _t{ std::string_view{ _data.tool_id.c_str() } };
add(_t, value, std::forward<Args>(args)...);
}
template <typename... Args>
static void add_secondary(tracker_t&, const gotcha_data& _data, data_type value,
Args&&... args)
{
// if(tim::settings::add_secondary())
{
tracker_t _s{ std::string_view{ _data.tool_id.c_str() } };
add(_s, _data, value, std::forward<Args>(args)...);
}
}
template <typename... Args>
static void add(std::string_view _name, data_type value, Args&&... args)
{
tracker_t _t{ _name };
add(_t, value, std::forward<Args>(args)...);
}
template <typename... Args>
static void add_secondary(tracker_t&, std::string_view _name, data_type value,
Args&&... args)
{
// if(tim::settings::add_secondary())
{
tracker_t _s{ _name };
add(_s, value, std::forward<Args>(args)...);
}
}
};
} // namespace component
} // namespace tim
@@ -24,6 +24,7 @@
#include "library/debug.hpp"
#include "library/defines.hpp"
#include "library/gpu.hpp"
#include "library/mproc.hpp"
#include "library/perfetto.hpp"
#include "library/runtime.hpp"
@@ -46,9 +47,11 @@
#include <csignal>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <limits>
#include <numeric>
#include <ostream>
#include <sstream>
#include <string>
#include <unistd.h>
@@ -270,6 +273,11 @@ configure_settings(bool _init)
"Enable support for Kokkos Tools", false, "kokkos",
"backend");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_USE_RCCLP",
"Enable support for ROCm Communication Collectives Library (RCCL) Performance",
false, "rocm", "rccl", "backend");
OMNITRACE_CONFIG_CL_SETTING(
bool, "OMNITRACE_KOKKOS_KERNEL_LOGGER", "Enables kernel logging", false,
"--omnitrace-kokkos-kernel-logger", "kokkos", "debugging");
@@ -582,12 +590,30 @@ configure_settings(bool _init)
}
if(!_found_sep && _cmd.size() > 1) _cmd.insert(_cmd.begin() + 1, "--");
auto _pid = getpid();
auto _ppid = getppid();
auto _proc = mproc::get_concurrent_processes(_ppid);
bool _main_proc = (_proc.size() < 2 || *_proc.begin() == _pid);
for(auto&& itr :
tim::delimit(_config->get<std::string>("OMNITRACE_CONFIG_FILE"), ";:"))
{
if(_config->get_suppress_config()) continue;
OMNITRACE_BASIC_VERBOSE(1, "Reading config file %s\n", itr.c_str());
_config->read(itr);
if(_config->get<bool>("OMNITRACE_CI") && _main_proc)
{
std::ifstream _in{ itr };
std::stringstream _iss{};
while(_in)
{
std::string _s{};
getline(_in, _s);
_iss << _s << "\n";
}
OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(),
_iss.str().c_str());
}
}
settings::suppress_config() = true;
@@ -666,6 +692,7 @@ configure_mode_settings()
_set("OMNITRACE_USE_ROCTRACER", false);
_set("OMNITRACE_USE_ROCPROFILER", false);
_set("OMNITRACE_USE_KOKKOSP", false);
_set("OMNITRACE_USE_RCCLP", false);
_set("OMNITRACE_USE_OMPT", false);
_set("OMNITRACE_USE_SAMPLING", false);
_set("OMNITRACE_USE_PROCESS_SAMPLING", false);
@@ -721,6 +748,7 @@ configure_mode_settings()
_set("OMNITRACE_USE_ROCTRACER", false);
_set("OMNITRACE_USE_ROCPROFILER", false);
_set("OMNITRACE_USE_KOKKOSP", false);
_set("OMNITRACE_USE_RCCLP", false);
_set("OMNITRACE_USE_OMPT", false);
_set("OMNITRACE_USE_SAMPLING", false);
_set("OMNITRACE_USE_PROCESS_SAMPLING", false);
@@ -817,6 +845,7 @@ configure_disabled_settings()
_handle_use_option("OMNITRACE_USE_PERFETTO", "perfetto");
_handle_use_option("OMNITRACE_USE_TIMEMORY", "timemory");
_handle_use_option("OMNITRACE_USE_OMPT", "ompt");
_handle_use_option("OMNITRACE_USE_RCCLP", "rcclp");
_handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi");
_handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer");
_handle_use_option("OMNITRACE_USE_ROCPROFILER", "rocprofiler");
@@ -1355,6 +1384,13 @@ get_use_code_coverage()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_use_rcclp()
{
static auto _v = get_config()->find("OMNITRACE_USE_RCCLP");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_critical_trace_debug()
{
@@ -216,6 +216,9 @@ get_use_sampling_cputime();
int
get_sampling_rtoffset();
bool
get_use_rcclp();
bool
get_timeline_sampling();
@@ -83,6 +83,8 @@
perfetto::Category("kokkos").SetDescription("Kokkos regions"), \
perfetto::Category("mpi").SetDescription("MPI regions"), \
perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \
perfetto::Category("rccl").SetDescription( \
"ROCm Communication Collectives Library (RCCL) regions"), \
perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \
perfetto::Category("host-critical-trace") \
.SetDescription("Host-side critical traces"), \
@@ -0,0 +1,88 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/rcclp.hpp"
#include "library/components/category_region.hpp"
#include "library/components/fwd.hpp"
#include "library/defines.hpp"
#include "library/timemory.hpp"
#include <timemory/timemory.hpp>
#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200
# include <rccl/rccl.h>
#else
# include <rccl.h>
#endif
#include <dlfcn.h>
#include <limits>
#include <memory>
#include <set>
#include <unordered_map>
static uint64_t global_id = std::numeric_limits<uint64_t>::max();
static void* librccl_handle = nullptr;
namespace omnitrace
{
namespace rcclp
{
void
configure()
{
comp::rccl_data_tracker_t::label() = "rccl_comm_data";
comp::rccl_data_tracker_t::description() = "Tracks RCCL communication data";
}
void
setup()
{
configure();
// make sure the symbols are loaded to be wrapped
auto libpath = tim::get_env<std::string>("OMNITRACE_RCCL_LIBRARY", "librccl.so");
librccl_handle = dlopen(libpath.c_str(), RTLD_NOW | RTLD_GLOBAL);
if(!librccl_handle) fprintf(stderr, "%s\n", dlerror());
dlerror(); // Clear any existing error
auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true);
if(_data)
comp::rccl_toolset_t::get_initializer() = [](comp::rccl_toolset_t& cb) {
cb.initialize<comp::rccl_comm_data>();
};
comp::configure_rcclp();
global_id = comp::activate_rcclp();
if(librccl_handle) dlclose(librccl_handle);
}
void
shutdown()
{
if(global_id < std::numeric_limits<uint64_t>::max())
comp::deactivate_rcclp(global_id);
}
} // namespace rcclp
} // namespace omnitrace
@@ -0,0 +1,57 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "library/defines.hpp"
namespace omnitrace
{
namespace rcclp
{
void
configure();
void
setup();
void
shutdown();
#if !defined(OMNITRACE_USE_RCCL) || \
(defined(OMNITRACE_USE_RCCL) && OMNITRACE_USE_RCCL == 0)
inline void
configure()
{}
inline void
setup()
{}
inline void
shutdown()
{}
#endif
} // namespace rcclp
} // namespace omnitrace
@@ -0,0 +1,522 @@
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_H_
#define NCCL_H_
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#define NCCL_MAJOR 2
#define NCCL_MINOR 11
#define NCCL_PATCH 4
#define NCCL_SUFFIX ""
#define NCCL_VERSION_CODE 21104
#define NCCL_VERSION(X, Y, Z) \
(((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z))
#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1
#ifdef __cplusplus
extern "C"
{
#endif
/*! @brief Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct
{
char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
/*! @brief Error type */
typedef enum
{
ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclNumResults = 6
} ncclResult_t;
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
*
* @details This integer is coded with the MAJOR, MINOR and PATCH level of the
* NCCL library
*/
ncclResult_t ncclGetVersion(int* version);
/// @cond include_hidden
ncclResult_t pncclGetVersion(int* version);
/// @endcond
/*! @brief Generates an ID for ncclCommInitRank
@details
Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be
called once and the Id should be distributed to all ranks in the
communicator before calling ncclCommInitRank.
@param[in]
uniqueId ncclUniqueId*
pointer to uniqueId
*/
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
/// @cond include_hidden
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/// @endcond
/*! @brief Creates a new communicator (multi thread/process version).
@details
rank must be between 0 and nranks-1 and unique within a communicator clique.
Each rank is associated to a CUDA device, which has to be set before calling
ncclCommInitRank.
ncclCommInitRank implicitly syncronizes with other ranks, so it must be
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
@param[in]
comm ncclComm_t*
communicator struct pointer
*/
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
int rank);
/// @cond include_hidden
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
int rank);
/// @endcond
/*! @brief Creates a clique of communicators (single process version).
*
* @details This is a convenience function to create a single-process communicator
* clique. Returns an array of ndev newly initialized communicators in comm. comm
* should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is
* NULL, the first ndev HIP devices are used. Order of devlist defines user-order of
* processors within the communicator.
* */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/// @cond include_hidden
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/// @endcond
/*! @brief Frees resources associated with communicator object, but waits for any
* operations that might still be running on the device */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
/// @cond include_hidden
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/// @endcond
/*! @brief Frees resources associated with communicator object and aborts any
* operations that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
/// @cond include_hidden
ncclResult_t pncclCommAbort(ncclComm_t comm);
/// @endcond
/*! @brief Returns a human-readable error message. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
/*! @brief Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/// @cond include_hidden
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/// @endcond
/*! @brief Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
/// @cond include_hidden
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/// @endcond
/*! @brief Returns the rocm device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
/// @cond include_hidden
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/// @endcond
/*! @brief Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
/// @cond include_hidden
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/// @endcond
/*! @brief Reduction operation selector */
/* Reduction operation selector */
typedef enum
{
ncclNumOps_dummy = 5
} ncclRedOp_dummy_t;
typedef enum
{
ncclSum = 0,
ncclProd = 1,
ncclMax = 2,
ncclMin = 3,
ncclAvg = 4,
/* ncclNumOps: The number of built-in ncclRedOp_t values. Also
* serves as the least possible value for dynamic ncclRedOp_t's
* as constructed by ncclRedOpCreate*** functions. */
ncclNumOps = 5,
/* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
* It is defined to be the largest signed value (since compilers
* are permitted to use signed enums) that won't grow
* sizeof(ncclRedOp_t) when compared to previous NCCL versions to
* maintain ABI compatibility. */
ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t))
} ncclRedOp_t;
/*! @brief Data types */
typedef enum
{
ncclInt8 = 0,
ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2,
ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6,
ncclHalf = 6,
ncclFloat32 = 7,
ncclFloat = 7,
ncclFloat64 = 8,
ncclDouble = 8,
ncclBfloat16 = 9,
ncclNumTypes = 10
} ncclDataType_t;
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum
{
/* ncclScalarDevice: The scalar is in device-visible memory and will be
* dereferenced while the collective is running. */
ncclScalarDevice = 0,
/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
* dereferenced before the ncclRedOpCreate***() function returns. */
ncclScalarHostImmediate = 1
} ncclScalarResidence_t;
/*
* ncclRedOpCreatePreMulSum
*
* Creates a new reduction operator which pre-multiplies input values by a given
* scalar locally before reducing them with peer values via summation. For use
* only with collectives launched against *comm* and *datatype*. The
* *residence* argument indicates how/when the memory pointed to by *scalar*
* will be dereferenced. Upon return, the newly created operator's handle
* is stored in *op*.
*/
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
ncclDataType_t datatype,
ncclScalarResidence_t residence,
ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
ncclDataType_t datatype,
ncclScalarResidence_t residence,
ncclComm_t comm);
/*
* ncclRedOpDestroy
*
* Destroys the reduction operator *op*. The operator must have been created by
* ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
* destroyed as soon as the last NCCL function which is given that operator returns.
*/
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*
* Collective communication operations
*
* Collective communication operations must be called separately for each
* communicator in a communicator clique.
*
* They return when operations have been enqueued on the CUDA stream.
*
* Since they may perform inter-CPU synchronization, each call has to be done
* from a different thread or process, or need to use Group Semantics (see
* below).
*/
/*!
* @brief Reduce
*
* @details Reduces data arrays of length count in sendbuff into recvbuff using op
* operation.
* recvbuff may be NULL on all calls except for root device.
* root is the rank (not the CUDA device) where data will reside after the
* operation is complete.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root,
ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief (deprecated) Broadcast (in-place)
*
* @details Copies count values from root to all other devices.
* root is the rank (not the CUDA device) where data resides before the
* operation is started.
*
* This operation is implicitely in place.
*/
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief Broadcast
*
* @details Copies count values from root to all other devices.
* root is the rank (not the HIP device) where data resides before the
* operation is started.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief All-Reduce
*
* @details Reduces data arrays of length count in sendbuff using op operation, and
* leaves identical copies of result on each recvbuff.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*!
* @brief Reduce-Scatter
*
* @details Reduces data in sendbuff using op operation and leaves reduced result
* scattered over the devices so that recvbuff on rank i will contain the i-th
* block of the result.
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
* should have a size of at least nranks*recvcount elements.
*
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
*/
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype,
ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief All-Gather
*
* @details Each device gathers sendcount values from other GPUs into recvbuff,
* receiving data from rank i at offset i*sendcount.
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
* should have a size of at least nranks*sendcount elements.
*
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
*/
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief Send
*
* @details Send data from sendbuff to rank peer.
* Rank peer needs to call ncclRecv with the same datatype and the same count from
* this rank.
*
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
* operations need to progress concurrently to complete, they must be fused within a
* ncclGroupStart/ ncclGroupEnd section.
*/
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief Receive
*
* @details Receive data from rank peer into recvbuff.
* Rank peer needs to call ncclSend with the same datatype and the same count to this
* rank.
*
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
* operations need to progress concurrently to complete, they must be fused within a
* ncclGroupStart/ ncclGroupEnd section.
*/
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief Gather
*
* @details Root device gathers sendcount values from other GPUs into recvbuff,
* receiving data from rank i at offset i*sendcount.
*
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
* should have a size of at least nranks*sendcount elements.
*
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
*/
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief Scatter
*
* @details Scattered over the devices so that recvbuff on rank i will contain the
* i-th block of the data on root.
*
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
* should have a size of at least nranks*recvcount elements.
*
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
*/
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief All-To-All
*
* @details Device (i) send (j)th block of data to device (j) and be placed as (i)th
* block. Each block for sending/receiving has count elements, which means
* that recvbuff and sendbuff should have a size of nranks*count elements.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief All-To-Allv
*
* @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
* to device (j). In the same time, device (i) receives recvcounts[j] of data
* from device (j) to be placed at rdispls[j].
* sendcounts, sdispls, recvcounts and rdispls are all measured in the units
* of datatype, not bytes.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
const size_t sdispls[], void* recvbuff,
const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
const size_t sdispls[], void* recvbuff,
const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*
* Group semantics
*
* When managing multiple GPUs from a single thread, and since NCCL collective
* calls may perform inter-CPU synchronization, we need to "group" calls for
* different ranks/devices into a single call.
*
* Grouping NCCL calls as being part of the same collective operation is done
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
* collective calls until the ncclGroupEnd call, which will wait for all calls
* to be complete. Note that for collective communication, ncclGroupEnd only
* guarantees that the operations are enqueued on the streams, not that
* the operation is effectively done.
*
* Both collective communication and ncclCommInitRank can be used in conjunction
* of ncclGroupStart/ncclGroupEnd, but not together.
*
* Group semantics also allow to fuse multiple operations on the same device
* to improve performance (for aggregated collective calls), or to permit
* concurrent progress of multiple send/receive operations.
*/
/*! @brief Group Start
*
* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
* a single NCCL operation. Nothing will be started on the CUDA stream until
* ncclGroupEnd.
*/
ncclResult_t ncclGroupStart();
/// @cond include_hidden
ncclResult_t pncclGroupStart();
/// @endcond
/*! @brief Group End
*
* End a group call. Start a fused NCCL operation consisting of all calls since
* ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
* need to be called after ncclGroupEnd.
*/
ncclResult_t ncclGroupEnd();
/// @cond include_hidden
ncclResult_t pncclGroupEnd();
/// @endcond
#ifdef __cplusplus
} // end extern "C"
#endif
#endif // end include guard
+272 -87
查看文件
@@ -12,6 +12,11 @@ if(NOT DEFINED NUM_PROCS)
set(NUM_PROCS 2)
endif()
math(EXPR NUM_SAMPLING_PROCS "${NUM_PROCS_REAL}-1")
if(NUM_SAMPLING_PROCS GREATER 3)
set(NUM_SAMPLING_PROCS 3)
endif()
math(EXPR NUM_THREADS "${NUM_PROCS_REAL} + (${NUM_PROCS_REAL} / 2)")
if(NUM_THREADS GREATER 12)
set(NUM_THREADS 12)
@@ -30,8 +35,8 @@ set(_test_openmp_env "OMP_PROC_BIND=spread" "OMP_PLACES=threads" "OMP_NUM_THREAD
set(_base_environment
"OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON"
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}"
"${_test_library_path}")
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_FILE_OUTPUT=ON" "${_test_openmp_env}" "${_test_library_path}")
set(_flat_environment
"OMNITRACE_USE_PERFETTO=ON"
@@ -43,11 +48,8 @@ set(_flat_environment
"OMNITRACE_COLLAPSE_PROCESSES=ON"
"OMNITRACE_COLLAPSE_THREADS=ON"
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count"
"OMP_PROC_BIND=spread"
"OMP_PLACES=threads"
"OMP_NUM_THREADS=2"
"LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}"
)
"${_test_openmp_env}"
"${_test_library_path}")
set(_lock_environment
"OMNITRACE_USE_SAMPLING=OFF"
@@ -62,18 +64,34 @@ set(_lock_environment
"${_test_library_path}")
set(_ompt_environment
"OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_USE_OMPT=ON" "OMNITRACE_CRITICAL_TRACE=OFF" "${_test_openmp_env}"
"OMNITRACE_USE_PERFETTO=ON"
"OMNITRACE_USE_TIMEMORY=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_USE_OMPT=ON"
"OMNITRACE_CRITICAL_TRACE=OFF"
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss"
"${_test_openmp_env}"
"${_test_library_path}")
set(_perfetto_environment
"OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=OFF" "OMNITRACE_USE_SAMPLING=ON"
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}"
"OMNITRACE_USE_PERFETTO=ON"
"OMNITRACE_USE_TIMEMORY=OFF"
"OMNITRACE_USE_SAMPLING=ON"
"OMNITRACE_USE_PROCESS_SAMPLING=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_PERFETTO_BACKEND=inprocess"
"OMNITRACE_PERFETTO_FILL_POLICY=ring_buffer"
"${_test_openmp_env}"
"${_test_library_path}")
set(_timemory_environment
"OMNITRACE_USE_PERFETTO=OFF" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON"
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}"
"OMNITRACE_USE_PERFETTO=OFF"
"OMNITRACE_USE_TIMEMORY=ON"
"OMNITRACE_USE_SAMPLING=ON"
"OMNITRACE_USE_PROCESS_SAMPLING=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss"
"${_test_openmp_env}"
"${_test_library_path}")
set(_test_environment ${_base_environment} "OMNITRACE_CRITICAL_TRACE=OFF")
@@ -86,7 +104,7 @@ set(_python_environment
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_TREE_OUTPUT=OFF"
"OMNITRACE_USE_PID=OFF"
"OMNITRACE_TIMEMORY_COMPONENTS=trip_count"
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count"
"${_test_library_path}"
"PYTHONPATH=${PROJECT_BINARY_DIR}/lib/python/site-packages")
@@ -100,6 +118,18 @@ set(_attach_environment
"OMNITRACE_USE_KOKKOSP=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_USE_PID=OFF"
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count"
"${_test_openmp_env}"
"${_test_library_path}")
set(_rccl_environment
"OMNITRACE_USE_PERFETTO=ON"
"OMNITRACE_USE_TIMEMORY=ON"
"OMNITRACE_USE_SAMPLING=OFF"
"OMNITRACE_USE_PROCESS_SAMPLING=ON"
"OMNITRACE_USE_RCCLP=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_USE_PID=OFF"
"${_test_openmp_env}"
"${_test_library_path}")
@@ -123,6 +153,83 @@ endif()
# -------------------------------------------------------------------------------------- #
set(_VALID_GPU OFF)
if(OMNITRACE_USE_HIP)
set(_VALID_GPU ON)
find_program(
OMNITRACE_ROCM_SMI_EXE
NAMES rocm-smi
HINTS ${ROCmVersion_DIR}
PATHS ${ROCmVersion_DIR}
PATH_SUFFIXES bin)
if(OMNITRACE_ROCM_SMI_EXE)
execute_process(
COMMAND ${OMNITRACE_ROCM_SMI_EXE}
OUTPUT_VARIABLE _RSMI_OUT
ERROR_VARIABLE _RSMI_ERR
RESULT_VARIABLE _RSMI_RET)
if(_RSMI_RET EQUAL 0)
if("${_RSMI_OUTPUT}" MATCHES "ERROR" OR "${_RSMI_ERR}" MATCHES "ERROR")
set(_VALID_GPU OFF)
endif()
else()
set(_VALID_GPU OFF)
endif()
endif()
if(NOT _VALID_GPU)
omnitrace_message(AUTHOR_WARNING
"rocm-smi did not successfully run. Disabling GPU tests...")
endif()
endif()
set(LULESH_USE_GPU ${LULESH_USE_HIP})
if(LULESH_USE_CUDA)
set(LULESH_USE_GPU ON)
endif()
# -------------------------------------------------------------------------------------- #
function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV)
set(_ENV_ONLY "OMNITRACE_(USE_MPIP|DEBUG_SETTINGS)=")
set(_FILE_CONTENTS)
set(_ENV_CONTENTS)
foreach(_VAL ${${_ENV}})
if("${_VAL}" MATCHES "^OMNITRACE_" AND NOT "${_VAL}" MATCHES "${_ENV_ONLY}")
set(_FILE_CONTENTS "${_FILE_CONTENTS}${_VAL}\n")
else()
list(APPEND _ENV_CONTENTS "${_VAL}")
endif()
endforeach()
set(_CONFIG_FILE ${PROJECT_BINARY_DIR}/omnitrace-tests-config/${_FILE})
file(
WRITE ${_CONFIG_FILE}
"# auto-generated by cmake
# default values
OMNITRACE_CI = ON
OMNITRACE_VERBOSE = 1
OMNITRACE_DL_VERBOSE = 1
OMNITRACE_SAMPLING_FREQ = 50
OMNITRACE_SAMPLING_DELAY = 0.05
OMNITRACE_SAMPLING_CPUS = 0-${NUM_SAMPLING_PROCS}
OMNITRACE_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES
OMNITRACE_ROCTRACER_HSA_API = ON
OMNITRACE_ROCTRACER_HSA_ACTIVITY = ON
# test-specific values
${_FILE_CONTENTS}
")
list(APPEND _ENV_CONTENTS "OMNITRACE_CONFIG_FILE=${_CONFIG_FILE}")
list(APPEND _ENV_CONTENTS "OMNITRACE_DEBUG_SETTINGS=1")
set(${_ENV}
"${_ENV_CONTENTS}"
PARENT_SCOPE)
endfunction()
# -------------------------------------------------------------------------------------- #
function(OMNITRACE_ADD_TEST)
foreach(_PREFIX RUNTIME REWRITE REWRITE_RUN)
foreach(_TYPE PASS FAIL SKIP)
@@ -135,10 +242,17 @@ function(OMNITRACE_ADD_TEST)
cmake_parse_arguments(
TEST
"SKIP_BASELINE;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING" # options
"NAME;TARGET;MPI;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value args
"NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value
# args
"${_KWARGS}" # multiple value args
${ARGN})
if(TEST_GPU AND NOT _VALID_GPU)
omnitrace_message(STATUS
"${TEST_NAME} requires a GPU and no valid GPUs were found")
return()
endif()
if("${TEST_MPI}" STREQUAL "")
set(TEST_MPI OFF)
endif()
@@ -189,7 +303,7 @@ function(OMNITRACE_ADD_TEST)
add_test(
NAME ${TEST_NAME}-baseline
COMMAND ${COMMAND_PREFIX} $<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
endif()
if(NOT TEST_SKIP_REWRITE)
@@ -199,7 +313,7 @@ function(OMNITRACE_ADD_TEST)
$<TARGET_FILE:omnitrace-exe> -o
$<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.inst
${TEST_REWRITE_ARGS} -- $<TARGET_FILE:${TEST_TARGET}>
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
if(NOT TEST_SKIP_SAMPLING)
add_test(
@@ -208,7 +322,7 @@ function(OMNITRACE_ADD_TEST)
$<TARGET_FILE:omnitrace-exe> -o
$<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.samp -M sampling
${TEST_REWRITE_ARGS} -- $<TARGET_FILE:${TEST_TARGET}>
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
endif()
add_test(
@@ -216,7 +330,7 @@ function(OMNITRACE_ADD_TEST)
COMMAND
${COMMAND_PREFIX} $<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.inst
${TEST_RUN_ARGS}
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
if(NOT TEST_SKIP_SAMPLING)
add_test(
@@ -225,7 +339,7 @@ function(OMNITRACE_ADD_TEST)
${COMMAND_PREFIX}
$<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.samp
${TEST_RUN_ARGS}
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
endif()
endif()
@@ -234,14 +348,14 @@ function(OMNITRACE_ADD_TEST)
NAME ${TEST_NAME}-runtime-instrument
COMMAND $<TARGET_FILE:omnitrace-exe> ${TEST_RUNTIME_ARGS} --
$<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
if(NOT TEST_SKIP_SAMPLING)
add_test(
NAME ${TEST_NAME}-runtime-instrument-sampling
COMMAND $<TARGET_FILE:omnitrace-exe> -M sampling ${TEST_RUNTIME_ARGS}
-- $<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
endif()
endif()
@@ -303,6 +417,7 @@ function(OMNITRACE_ADD_TEST)
endforeach()
if(TEST ${TEST_NAME}-${_TEST})
omnitrace_write_test_config(${TEST_NAME}-${_TEST}.cfg _environ)
set_tests_properties(
${TEST_NAME}-${_TEST}
PROPERTIES ENVIRONMENT
@@ -431,6 +546,7 @@ endfunction()
# general config file tests
#
# -------------------------------------------------------------------------------------- #
file(
WRITE ${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg
"
@@ -476,6 +592,7 @@ omnitrace_add_test(
NAME transpose
TARGET transpose
MPI ${TRANSPOSE_USE_MPI}
GPU ON
NUM_PROCS ${NUM_PROCS}
REWRITE_ARGS -e -v 2 --print-instructions -E uniform_int_distribution
RUNTIME_ARGS
@@ -497,6 +614,7 @@ omnitrace_add_test(
TARGET transpose
LABELS "loops"
MPI ${TRANSPOSE_USE_MPI}
GPU ON
NUM_PROCS ${NUM_PROCS}
REWRITE_ARGS
-e
@@ -577,78 +695,82 @@ omnitrace_add_test(
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF"
REWRITE_FAIL_REGEX "0 instrumented loops in procedure")
omnitrace_add_test(
SKIP_RUNTIME
NAME "mpi"
TARGET mpi-example
MPI ON
NUM_PROCS 4
REWRITE_ARGS
-e
-v
2
--label
file
line
return
args
--min-instructions
0
ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1"
REWRITE_RUN_PASS_REGEX
"(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')"
REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)")
if(OMNITRACE_USE_MPI OR OMNITRACE_USE_MPI_HEADERS)
omnitrace_add_test(
SKIP_RUNTIME
NAME "mpi"
TARGET mpi-example
MPI ON
NUM_PROCS 4
REWRITE_ARGS
-e
-v
2
--label
file
line
return
args
--min-instructions
0
ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1"
REWRITE_RUN_PASS_REGEX
"(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')"
REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)")
omnitrace_add_test(
SKIP_RUNTIME SKIP_SAMPLING
NAME "mpi-flat-mpip"
TARGET mpi-example
MPI ON
NUM_PROCS 4
LABELS "mpip"
REWRITE_ARGS
-e
-v
2
--label
file
line
return
args
--min-instructions
0
ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_USE_MPIP=ON"
REWRITE_RUN_PASS_REGEX
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
)
omnitrace_add_test(
SKIP_RUNTIME SKIP_SAMPLING
NAME "mpi-flat-mpip"
TARGET mpi-example
MPI ON
NUM_PROCS 4
LABELS "mpip"
REWRITE_ARGS
-e
-v
2
--label
file
line
return
args
--min-instructions
0
ENVIRONMENT
"${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON"
REWRITE_RUN_PASS_REGEX
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
)
omnitrace_add_test(
SKIP_RUNTIME SKIP_SAMPLING
NAME "mpi-flat"
TARGET mpi-example
MPI ON
NUM_PROCS 4
LABELS "mpip"
REWRITE_ARGS
-e
-v
2
--label
file
line
return
args
--min-instructions
0
ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF"
REWRITE_RUN_PASS_REGEX
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
)
omnitrace_add_test(
SKIP_RUNTIME SKIP_SAMPLING
NAME "mpi-flat"
TARGET mpi-example
MPI ON
NUM_PROCS 4
LABELS "mpip"
REWRITE_ARGS
-e
-v
2
--label
file
line
return
args
--min-instructions
0
ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF"
REWRITE_RUN_PASS_REGEX
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
)
endif()
omnitrace_add_test(
NAME lulesh
TARGET lulesh
MPI ${LULESH_USE_MPI}
GPU ${LULESH_USE_GPU}
NUM_PROCS 8
LABELS "kokkos"
REWRITE_ARGS -e -v 2 --label file line return args
@@ -674,6 +796,7 @@ omnitrace_add_test(
NAME lulesh-baseline-kokkosp-libomnitrace
TARGET lulesh
MPI ${LULESH_USE_MPI}
GPU ${LULESH_USE_GPU}
NUM_PROCS 8
LABELS "kokkos;kokkos-profile-library"
RUN_ARGS -i 10 -s 20 -p
@@ -686,6 +809,7 @@ omnitrace_add_test(
NAME lulesh-baseline-kokkosp-libomnitrace-dl
TARGET lulesh
MPI ${LULESH_USE_MPI}
GPU ${LULESH_USE_GPU}
NUM_PROCS 8
LABELS "kokkos;kokkos-profile-library"
RUN_ARGS -i 10 -s 20 -p
@@ -698,6 +822,7 @@ omnitrace_add_test(
NAME lulesh-kokkosp
TARGET lulesh
MPI ${LULESH_USE_MPI}
GPU ${LULESH_USE_GPU}
NUM_PROCS 8
LABELS "kokkos"
REWRITE_ARGS -e -v 2
@@ -721,6 +846,7 @@ omnitrace_add_test(
NAME lulesh-perfetto
TARGET lulesh
MPI ${LULESH_USE_MPI}
GPU ${LULESH_USE_GPU}
NUM_PROCS 8
LABELS "kokkos;loops"
REWRITE_ARGS -e -v 2
@@ -743,6 +869,7 @@ omnitrace_add_test(
NAME lulesh-timemory
TARGET lulesh
MPI ${LULESH_USE_MPI}
GPU ${LULESH_USE_GPU}
NUM_PROCS 8
LABELS "kokkos;loops"
REWRITE_ARGS -e -v 2 -l --dynamic-callsites --traps --allow-overlapping
@@ -965,6 +1092,64 @@ if(TARGET parallel-overhead AND _VALID_PTRACE_SCOPE)
"Dyninst was unable to attach to the specified process")
endif()
# -------------------------------------------------------------------------------------- #
#
# rccl tests
#
# -------------------------------------------------------------------------------------- #
foreach(_TARGET ${RCCL_TEST_TARGETS})
string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}")
string(REPLACE "_" "-" _NAME "${_NAME}")
omnitrace_add_test(
SKIP_SAMPLING
NAME rccl-test-${_NAME}
TARGET ${_TARGET}
LABELS "rccl-tests;rcclp"
MPI ON
GPU ON
NUM_PROCS 1
REWRITE_ARGS
-e
-v
2
-i
8
--label
file
line
return
args
RUNTIME_ARGS
-e
-v
1
-i
8
--label
file
line
return
args
RUN_ARGS -t
1
-g
1
-i
10
-w
2
-m
2
-p
-c
1
-z
-s
1
ENVIRONMENT "${_rccl_environment}")
endforeach()
# -------------------------------------------------------------------------------------- #
#
# python tests