RCCL support (#93)
* Initial support for RCCL
* OMNITRACE_USE_RCCLP + sampling tweaks
- also OMNITRACE_SAMPLING_KEEP_INTERNAL option
- minor modifications to sampling to use keep internal option + discard funlockfile
* Update docker and workflows to download RCCL
* Update CPack DEB with rocprofiler dependency
* Rework rccl into library and library/components folder
- add tpls/rccl/rccl/rccl.h
* Fix timemory includes
* rcclp inline definitions when disabled
* Tweaks to ubuntu-focal-external-rocm
- disable ompt
- enable building testing
* Tweaks to ubuntu-focal-external-rocm
- ctest exclude
* Tweak ubuntu-focal.yml
- remove source /.../setup-env.sh, replace with $GITHUB_ENV
* Fix ubuntu-focal-rocm + OMPI + root
* Improved rocm-smi error handling
- Recover from rocm-smi errors
- Disabling rocm-smi after recovering from errors
- Werror in developer mode
- Remove State::DelayedInit
- Add State::Disabled
* formatting
* Fix merge of OMNITRACE_SAMPLING_KEEP_INTERNAL
* Update RCCL include directory
- based on ROCm version we need with <rccl/rccl.h> or <rccl.h>
* RCCL Testing
- updated tests to use configuration files
- many tests generate a configuration file
- tests how have GPU option
- enable ncclCommCount, disable ncclGetVersion
- add testing for RCCLP via rccl-tests
- working directory of tests is PROJECT_BINARY_DIR
- add nccl/rccl functions to get_whole_function_names
- some clang compiler fixes
* Handle RCCL include w/o HIP
* RCCL requires HIP
* Update OMNITRACE_SAMPLING_CPUS for testing
* Update tests/CMakeLists.txt
* Debug settings
* Install MPI even when USE_MPI=OFF
* exclude printf
* skip mpi tests w/o USE_MPI or USE_MPI_HEADERS
* update ubuntu rocm workflow
* Fix configure env step for ubuntu rocm
[ROCm/rocprofiler-systems commit: 45be03906a]
这个提交包含在:
@@ -28,6 +28,7 @@ parse:
|
||||
NAME: '*'
|
||||
TARGET: '*'
|
||||
MPI: '*'
|
||||
GPU: '*'
|
||||
NUM_PROCS: '*'
|
||||
REWRITE_TIMEOUT: '*'
|
||||
RUNTIME_TIMEOUT: '*'
|
||||
|
||||
@@ -121,5 +121,6 @@ jobs:
|
||||
with:
|
||||
name: data-${{ github.job }}-files
|
||||
path: |
|
||||
build/omnitrace-tests-config/*.cfg
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
@@ -150,5 +150,6 @@ jobs:
|
||||
with:
|
||||
name: data-${{ github.job }}-files
|
||||
path: |
|
||||
build/omnitrace-tests-config/*.cfg
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
add-apt-repository -y ppa:ubuntu-toolchain-r/test &&
|
||||
apt-get update &&
|
||||
apt-get upgrade -y &&
|
||||
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev ${{ matrix.compiler }} &&
|
||||
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev libmpich-dev mpich ${{ matrix.compiler }} &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy &&
|
||||
python3 -m pip install perfetto &&
|
||||
@@ -166,6 +166,7 @@ jobs:
|
||||
with:
|
||||
name: data-${{ github.job }}-files
|
||||
path: |
|
||||
build/omnitrace-tests-config/*.cfg
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
@@ -176,12 +177,23 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
compiler: ['g++']
|
||||
rocm_version: ['4.3', '4.5', 'debian']
|
||||
rocm_version: ['4.3', '4.5', '5.0']
|
||||
mpi_headers: ['OFF']
|
||||
build_jobs: ['4']
|
||||
ctest_exclude: ['-LE "mpi-example|transpose"']
|
||||
perfetto-tools: ['ON']
|
||||
include:
|
||||
- compiler: 'g++'
|
||||
rocm_version: 'debian'
|
||||
mpi_headers: 'ON'
|
||||
build_jobs: '2'
|
||||
ctest_exclude: '-LE transpose'
|
||||
perfetto-tools: 'OFF'
|
||||
|
||||
env:
|
||||
BUILD_TYPE: MinSizeRel
|
||||
OMNITRACE_OUTPUT_PATH: omnitrace-tests-output
|
||||
OMNITRACE_OUTPUT_PREFIX: "%argt%/"
|
||||
OMPI_ALLOW_RUN_AS_ROOT: 1
|
||||
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
@@ -194,18 +206,41 @@ jobs:
|
||||
wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&
|
||||
echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list &&
|
||||
apt-get update &&
|
||||
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl &&
|
||||
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev curl libopenmpi-dev openmpi-bin libfabric-dev &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install 'cmake==3.16.3' &&
|
||||
for i in 6 7 8 9; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
|
||||
python3 -m pip install 'cmake==3.21.4' &&
|
||||
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
|
||||
|
||||
- name: Install RCCL
|
||||
if: ${{ matrix.rocm_version != '4.3' }}
|
||||
timeout-minutes: 5
|
||||
run:
|
||||
apt-get install -y rccl-dev
|
||||
|
||||
- name: Configure Env
|
||||
run:
|
||||
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV &&
|
||||
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV &&
|
||||
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV &&
|
||||
echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH &&
|
||||
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
|
||||
run: |
|
||||
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV
|
||||
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV
|
||||
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV
|
||||
echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
|
||||
cat << EOF > test-install.cfg
|
||||
OMNITRACE_USE_TIMEMORY = ON
|
||||
OMNITRACE_USE_PERFETTO = ON
|
||||
OMNITRACE_USE_PID = OFF
|
||||
OMNITRACE_USE_SAMPLING = OFF
|
||||
OMNITRACE_USE_PROCESS_SAMPLING = OFF
|
||||
OMNITRACE_COUT_OUTPUT = ON
|
||||
OMNITRACE_TIME_OUTPUT = OFF
|
||||
OMNITRACE_TIMEMORY_COMPONENTS = cpu_clock cpu_util current_peak_rss kernel_mode_time monotonic_clock monotonic_raw_clock network_stats num_io_in num_io_out num_major_page_faults num_minor_page_faults page_rss peak_rss priority_context_switch process_cpu_clock process_cpu_util read_bytes read_char system_clock thread_cpu_clock thread_cpu_util timestamp trip_count user_clock user_mode_time virtual_memory voluntary_context_switch wall_clock written_bytes written_char
|
||||
OMNITRACE_OUTPUT_PATH = omnitrace-tests-output
|
||||
OMNITRACE_OUTPUT_PREFIX = %tag%/
|
||||
OMNITRACE_DEBUG = OFF
|
||||
OMNITRACE_VERBOSE = 3
|
||||
OMNITRACE_DL_VERBOSE = 3
|
||||
OMNITRACE_PERFETTO_BACKEND = system
|
||||
EOF
|
||||
realpath test-install.cfg
|
||||
cat test-install.cfg
|
||||
|
||||
- name: Configure CMake
|
||||
timeout-minutes: 10
|
||||
@@ -217,22 +252,27 @@ jobs:
|
||||
-DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
|
||||
-DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }}
|
||||
-DCMAKE_INSTALL_PREFIX=/opt/omnitrace
|
||||
-DOMNITRACE_BUILD_TESTING=OFF
|
||||
-DOMNITRACE_BUILD_TESTING=ON
|
||||
-DOMNITRACE_BUILD_DEVELOPER=ON
|
||||
-DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF
|
||||
-DOMNITRACE_BUILD_LTO=OFF
|
||||
-DOMNITRACE_USE_MPI=OFF
|
||||
-DOMNITRACE_USE_MPI_HEADERS=ON
|
||||
-DOMNITRACE_USE_HIP=ON
|
||||
-DOMNITRACE_MAX_THREADS=32
|
||||
-DOMNITRACE_USE_SANITIZER=OFF
|
||||
-DOMNITRACE_USE_PAPI=OFF
|
||||
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON
|
||||
-DOMNITRACE_USE_OMPT=OFF
|
||||
-DOMNITRACE_USE_PYTHON=ON
|
||||
-DOMNITRACE_USE_MPI_HEADERS=${{ matrix.mpi_headers }}
|
||||
-DOMNITRACE_USE_SANITIZER=OFF
|
||||
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=${{ matrix.perfetto-tools }}
|
||||
-DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs
|
||||
-DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10"
|
||||
-DOMNITRACE_CI_MPI_RUN_AS_ROOT=${{ matrix.mpi_headers }}
|
||||
|
||||
- name: Build
|
||||
timeout-minutes: 60
|
||||
run:
|
||||
cmake --build build --target all --parallel 2 -- VERBOSE=1
|
||||
cmake --build build --target all --parallel ${{ matrix.build_jobs }} -- VERBOSE=1
|
||||
|
||||
- name: Install
|
||||
run:
|
||||
@@ -244,20 +284,28 @@ jobs:
|
||||
cd build &&
|
||||
ldd ./bin/omnitrace &&
|
||||
./bin/omnitrace --help &&
|
||||
ctest -V -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
|
||||
ctest -V --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure
|
||||
ctest -V ${{ matrix.ctest_exclude }} -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
|
||||
ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure
|
||||
|
||||
- name: Configure Install Env
|
||||
run: |
|
||||
echo "/opt/omnitrace/bin" >> $GITHUB_PATH
|
||||
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
|
||||
echo "OMNITRACE_CONFIG_FILE=test-install.cfg" >> $GITHUB_ENV
|
||||
|
||||
- name: Test Install
|
||||
timeout-minutes: 10
|
||||
if: ${{ matrix.perfetto-tools == 'ON' }}
|
||||
run: |
|
||||
set -v
|
||||
cat ${OMNITRACE_CONFIG_FILE}
|
||||
omnitrace-perfetto-traced --background
|
||||
export OMNITRACE_DEBUG=ON
|
||||
export OMNITRACE_PERFETTO_BACKEND=system
|
||||
which omnitrace-avail
|
||||
ldd $(which omnitrace-avail)
|
||||
omnitrace-avail --help
|
||||
omnitrace-avail -a
|
||||
which omnitrace-python
|
||||
omnitrace-python --help
|
||||
which omnitrace-critical-trace
|
||||
ldd $(which omnitrace-critical-trace)
|
||||
which omnitrace
|
||||
@@ -272,6 +320,31 @@ jobs:
|
||||
du -m ls-perfetto-trace.proto
|
||||
/opt/conda/envs/py3.8/bin/python ./tests/validate-perfetto-proto.py -p -i ./ls-perfetto-trace.proto
|
||||
|
||||
- name: Test Install
|
||||
timeout-minutes: 10
|
||||
if: ${{ matrix.perfetto-tools == 'OFF' }}
|
||||
run: |
|
||||
set -v
|
||||
cat ${OMNITRACE_CONFIG_FILE}
|
||||
which omnitrace-avail
|
||||
ldd $(which omnitrace-avail)
|
||||
omnitrace-avail --help
|
||||
omnitrace-avail -a
|
||||
which omnitrace-python
|
||||
omnitrace-python --help
|
||||
which omnitrace-critical-trace
|
||||
ldd $(which omnitrace-critical-trace)
|
||||
which omnitrace
|
||||
ldd $(which omnitrace)
|
||||
omnitrace --help
|
||||
omnitrace -e -v 1 -o sleep.inst --simulate -- sleep
|
||||
omnitrace -e -v 1 --simulate -- sleep
|
||||
omnitrace -e -v 1 -o sleep.inst -- sleep
|
||||
./sleep.inst 5
|
||||
omnitrace -e -v 1 -- sleep 5
|
||||
cat omnitrace-tests-output/sleep.inst/wall_clock.txt
|
||||
cat omnitrace-tests-output/sleep/wall_clock.txt
|
||||
|
||||
- name: Test User API
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
@@ -293,6 +366,7 @@ jobs:
|
||||
name: data-${{ github.job }}-files
|
||||
path: |
|
||||
omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-config/*.cfg
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
@@ -445,5 +519,6 @@ jobs:
|
||||
with:
|
||||
name: data-${{ github.job }}-files
|
||||
path: |
|
||||
${{ github.workspace }}/build/omnitrace-tests-config/*.cfg
|
||||
${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt
|
||||
${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
@@ -126,6 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support"
|
||||
omnitrace_add_option(
|
||||
OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling"
|
||||
${OMNITRACE_USE_HIP})
|
||||
omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP})
|
||||
omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS
|
||||
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
|
||||
omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON)
|
||||
@@ -175,14 +176,18 @@ if(NOT OMNITRACE_USE_HIP)
|
||||
set(OMNITRACE_USE_ROCM_SMI
|
||||
OFF
|
||||
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
|
||||
set(OMNITRACE_USE_RCCL
|
||||
OFF
|
||||
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
|
||||
elseif(
|
||||
OMNITRACE_USE_HIP
|
||||
AND NOT OMNITRACE_USE_ROCTRACER
|
||||
AND NOT OMNITRACE_USE_ROCPROFILER
|
||||
AND NOT OMNITRACE_USE_ROCM_SMI)
|
||||
AND NOT OMNITRACE_USE_ROCM_SMI
|
||||
AND NOT OMNITRACE_USE_RCCL)
|
||||
omnitrace_message(
|
||||
AUTHOR_WARNING
|
||||
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, and rocm-smi options are disabled"
|
||||
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled"
|
||||
)
|
||||
set(OMNITRACE_USE_HIP OFF)
|
||||
endif()
|
||||
|
||||
@@ -157,6 +157,7 @@ if(NOT OMNITRACE_BUILD_DYNINST)
|
||||
endif()
|
||||
endif()
|
||||
if(ROCmVersion_FOUND)
|
||||
set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
|
||||
set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
|
||||
set(_ROCM_SMI_SUFFIX
|
||||
" (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})")
|
||||
@@ -167,6 +168,9 @@ endif()
|
||||
if(OMNITRACE_USE_ROCTRACER)
|
||||
list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}")
|
||||
endif()
|
||||
if(OMNITRACE_USE_ROCPROFILER)
|
||||
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}")
|
||||
endif()
|
||||
if(OMNITRACE_USE_MPI)
|
||||
if("${OMNITRACE_MPI_IMPL}" STREQUAL "openmpi")
|
||||
list(APPEND _DEBIAN_PACKAGE_DEPENDS "libopenmpi-dev")
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file
|
||||
# Copyright.txt or https://cmake.org/licensing for details.
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
set(RCCL-Headers_INCLUDE_DIR_INTERNAL
|
||||
"${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/rccl"
|
||||
CACHE PATH "Path to internal rccl.h")
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
|
||||
set(ROCM_PATH "$ENV{ROCM_PATH}")
|
||||
endif()
|
||||
|
||||
foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl)
|
||||
if(EXISTS ${_DIR})
|
||||
get_filename_component(_ABS_DIR "${_DIR}" REALPATH)
|
||||
list(APPEND _RCCL_PATHS ${_ABS_DIR})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
find_package(
|
||||
rccl
|
||||
QUIET
|
||||
CONFIG
|
||||
HINTS
|
||||
${_RCCL_PATHS}
|
||||
PATHS
|
||||
${_RCCL_PATHS}
|
||||
PATH_SUFFIXES
|
||||
rccl/lib/cmake)
|
||||
|
||||
if(NOT rccl_FOUND)
|
||||
set(RCCL-Headers_INCLUDE_DIR
|
||||
"${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
|
||||
CACHE PATH "Path to RCCL headers")
|
||||
else()
|
||||
set(RCCL-Headers_INCLUDE_DIR
|
||||
"${rccl_INCLUDE_DIR}"
|
||||
CACHE PATH "Path to RCCL headers")
|
||||
endif()
|
||||
|
||||
# because of the annoying warning starting with v5.2.0, we've got to do this crap
|
||||
if(ROCmVersion_NUMERIC_VERSION)
|
||||
if(ROCmVersion_NUMERIC_VERSION LESS 50200)
|
||||
set(_RCCL-Headers_FILE "rccl.h")
|
||||
set(_RCCL-Headers_DIR "/rccl")
|
||||
else()
|
||||
set(_RCCL-Headers_FILE "rccl/rccl.h")
|
||||
set(_RCCL-Headers_DIR "")
|
||||
endif()
|
||||
else()
|
||||
set(_RCCL-Headers_FILE "rccl/rccl.h")
|
||||
set(_RCCL-Headers_DIR "")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}")
|
||||
omnitrace_message(
|
||||
AUTHOR_WARNING
|
||||
"RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
|
||||
)
|
||||
set(RCCL-Headers_INCLUDE_DIR
|
||||
"${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}"
|
||||
CACHE PATH "Path to RCCL headers" FORCE)
|
||||
endif()
|
||||
|
||||
unset(_RCCL-Headers_FILE)
|
||||
unset(_RCCL-Headers_DIR)
|
||||
|
||||
mark_as_advanced(RCCL-Headers_INCLUDE_DIR)
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR)
|
||||
|
||||
# ------------------------------------------------------------------------------#
|
||||
|
||||
if(RCCL-Headers_FOUND)
|
||||
add_library(roc::rccl-headers INTERFACE IMPORTED)
|
||||
set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR})
|
||||
|
||||
target_include_directories(roc::rccl-headers SYSTEM
|
||||
INTERFACE ${RCCL-Headers_INCLUDE_DIR})
|
||||
|
||||
add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED)
|
||||
target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers)
|
||||
endif()
|
||||
|
||||
# ------------------------------------------------------------------------------#
|
||||
@@ -20,6 +20,8 @@ omnitrace_add_interface_library(omnitrace-rocprofiler
|
||||
"Provides flags and libraries for rocprofiler")
|
||||
omnitrace_add_interface_library(omnitrace-rocm-smi
|
||||
"Provides flags and libraries for rocm-smi")
|
||||
omnitrace_add_interface_library(
|
||||
omnitrace-rccl "Provides flags for ROCm Communication Collectives Library (RCCL)")
|
||||
omnitrace_add_interface_library(omnitrace-mpi "Provides MPI or MPI headers")
|
||||
omnitrace_add_interface_library(omnitrace-ptl "Enables PTL support (tasking)")
|
||||
omnitrace_add_interface_library(omnitrace-papi "Enable PAPI support")
|
||||
@@ -37,6 +39,7 @@ set(OMNITRACE_EXTENSION_LIBRARIES
|
||||
omnitrace::omnitrace-roctracer
|
||||
omnitrace::omnitrace-rocprofiler
|
||||
omnitrace::omnitrace-rocm-smi
|
||||
omnitrace::omnitrace-rccl
|
||||
omnitrace::omnitrace-mpi
|
||||
omnitrace::omnitrace-ptl
|
||||
omnitrace::omnitrace-ompt
|
||||
@@ -196,6 +199,17 @@ if(OMNITRACE_USE_ROCM_SMI)
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${rocm-smi_LIBRARY_DIRS}")
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# RCCL
|
||||
#
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
if(OMNITRACE_USE_RCCL)
|
||||
find_package(RCCL-Headers ${omnitrace_FIND_QUIETLY} REQUIRED)
|
||||
target_link_libraries(omnitrace-rccl INTERFACE roc::rccl-headers)
|
||||
omnitrace_target_compile_definitions(omnitrace-rccl INTERFACE OMNITRACE_USE_RCCL)
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# MPI
|
||||
|
||||
@@ -24,7 +24,7 @@ ARG AMDGPU_RPM=21.40.2/rhel/7.9/amdgpu-install-21.40.2.40502-1.el7.noarch.rpm
|
||||
|
||||
RUN yum install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \
|
||||
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms --skip-broken -y && \
|
||||
yum install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
|
||||
yum install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \
|
||||
yum update -y && \
|
||||
yum clean all
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ RUN zypper --no-gpg-checks install -y https://repo.radeon.com/amdgpu-install/${A
|
||||
zypper addrepo https://download.opensuse.org/repositories/devel:languages:perl/SLE_15/devel:languages:perl.repo && \
|
||||
zypper --non-interactive --gpg-auto-import-keys refresh && \
|
||||
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms -y && \
|
||||
zypper install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
|
||||
zypper install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \
|
||||
zypper clean --all
|
||||
|
||||
ARG PYTHON_VERSIONS="6 7 8 9 10"
|
||||
|
||||
@@ -28,7 +28,7 @@ RUN apt-get update && \
|
||||
echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_REPO_VERSION}/ ${ROCM_REPO_DIST} main" | tee /etc/apt/sources.list.d/rocm.list && \
|
||||
apt-get update && \
|
||||
apt-get dist-upgrade -y && \
|
||||
apt-get install -y rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \
|
||||
apt-get install -y rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \
|
||||
apt-get autoclean
|
||||
|
||||
RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
|
||||
|
||||
@@ -32,3 +32,4 @@ add_subdirectory(openmp)
|
||||
add_subdirectory(mpi)
|
||||
add_subdirectory(python)
|
||||
add_subdirectory(lulesh)
|
||||
add_subdirectory(rccl)
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
|
||||
project(omnitrace-rccl-example LANGUAGES CXX)
|
||||
|
||||
find_package(rccl)
|
||||
find_package(hip HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR})
|
||||
|
||||
function(rccl_message)
|
||||
if("${CMAKE_PROJECT_NAME}" STREQUAL "omnitrace")
|
||||
omnitrace_message(${ARGN})
|
||||
else()
|
||||
message(${ARGN})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
if(hip_FOUND AND rccl_FOUND)
|
||||
include(FetchContent)
|
||||
fetchcontent_declare(
|
||||
rccl-tests GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rccl-tests.git)
|
||||
|
||||
# After the following call, the CMake targets defined by googletest and Catch2 will be
|
||||
# available to the rest of the build
|
||||
fetchcontent_makeavailable(rccl-tests)
|
||||
|
||||
get_filename_component(rccl_ROOT_DIR "${rccl_INCLUDE_DIR}" DIRECTORY)
|
||||
|
||||
rccl_message(STATUS "Building rccl-tests...")
|
||||
execute_process(
|
||||
COMMAND make HIP_HOME=${ROCM_PATH} RCCL_HOME=${rccl_ROOT_DIR}
|
||||
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/rccl-tests-src
|
||||
RESULT_VARIABLE _RCCL_BUILD_RET
|
||||
ERROR_VARIABLE _RCCL_BUILD_ERR
|
||||
OUTPUT_VARIABLE _RCCL_BUILD_OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
if(NOT _RCCL_BUILD_RET EQUAL 0)
|
||||
rccl_message(${_RCCL_BUILD_OUT})
|
||||
rccl_message(AUTHOR_WARNING "Failed to build rccl-tests: ${_RCCL_BUILD_ERR}")
|
||||
else()
|
||||
file(GLOB RCCL_TEST_EXECUTABLES
|
||||
${CMAKE_BINARY_DIR}/_deps/rccl-tests-src/build/*_perf)
|
||||
set(_RCCL_TEST_TARGETS)
|
||||
|
||||
foreach(_EXE ${RCCL_TEST_EXECUTABLES})
|
||||
get_filename_component(_EXE_NAME "${_EXE}" NAME)
|
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${_EXE}
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME})
|
||||
add_executable(rccl-tests::${_EXE_NAME} IMPORTED GLOBAL)
|
||||
set_property(
|
||||
TARGET rccl-tests::${_EXE_NAME}
|
||||
PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME})
|
||||
list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}")
|
||||
endforeach()
|
||||
|
||||
set(RCCL_TEST_TARGETS
|
||||
"${_RCCL_TEST_TARGETS}"
|
||||
CACHE INTERNAL "rccl-test targets")
|
||||
endif()
|
||||
else()
|
||||
rccl_message(AUTHOR_WARNING "${PROJECT_NAME} skipped. Missing RCCL and/or HIP...")
|
||||
endif()
|
||||
@@ -40,13 +40,16 @@ endif()
|
||||
|
||||
add_executable(transpose transpose.cpp)
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang"
|
||||
AND NOT CMAKE_CXX_COMPILER_IS_HIPCC
|
||||
AND NOT HIPCC_EXECUTABLE)
|
||||
target_link_libraries(
|
||||
transpose
|
||||
PRIVATE
|
||||
$<IF:$<TARGET_EXISTS:omnitrace::omnitrace-compile-options>,omnitrace::omnitrace-compile-options,>
|
||||
$<IF:$<TARGET_EXISTS:hip::host>,hip::host,>
|
||||
$<IF:$<TARGET_EXISTS:hip::device>,hip::device,>)
|
||||
PRIVATE $<TARGET_NAME_IF_EXISTS:omnitrace::omnitrace-compile-options>
|
||||
$<TARGET_NAME_IF_EXISTS:hip::host> $<TARGET_NAME_IF_EXISTS:hip::device>)
|
||||
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
target_link_libraries(
|
||||
transpose PRIVATE $<TARGET_NAME_IF_EXISTS:omnitrace::omnitrace-compile-options>)
|
||||
else()
|
||||
target_compile_options(transpose PRIVATE -W -Wall)
|
||||
endif()
|
||||
|
||||
@@ -52,7 +52,13 @@ get_whole_function_names()
|
||||
"rocr::core::BusyWaitSignal::WaitAcquire",
|
||||
"rocr::core::BusyWaitSignal::WaitRelaxed", "rocr::HSA::hsa_signal_wait_scacquire",
|
||||
"rocr::os::ThreadTrampoline", "rocr::image::ImageRuntime::CreateImageManager",
|
||||
"rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info", "event_base_loop"
|
||||
"rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info",
|
||||
"event_base_loop", "bootstrapRoot", "bootstrapNetAccept", "ncclCommInitRank",
|
||||
"ncclCommInitAll", "ncclCommDestroy", "ncclCommCount", "ncclCommCuDevice",
|
||||
"ncclCommUserRank", "ncclReduce", "ncclBcast", "ncclBroadcast", "ncclAllReduce",
|
||||
"ncclReduceScatter", "ncclAllGather", "ncclGroupStart", "ncclGroupEnd",
|
||||
"ncclSend", "ncclRecv", "ncclGather", "ncclScatter", "ncclAllToAll",
|
||||
"ncclAllToAllv"
|
||||
};
|
||||
#else
|
||||
// should hopefully be removed soon
|
||||
|
||||
@@ -429,6 +429,7 @@ module_function::is_routine_constrained() const
|
||||
|
||||
static std::regex exclude(
|
||||
"(omnitrace|tim::|N3tim|MPI_Init|MPI_Finalize|dyninst|tm_clones)", regex_opts);
|
||||
static std::regex exclude_printf("(|v|f)printf$", regex_opts);
|
||||
static std::regex exclude_cxx(
|
||||
"(std::_Sp_counted_base|std::(use|has)_facet|std::locale|::sentry|^std::_|::_(M|"
|
||||
"S)_|::basic_string[a-zA-Z,<>: ]+::_M_create|::__|::_(Alloc|State)|"
|
||||
@@ -456,6 +457,11 @@ module_function::is_routine_constrained() const
|
||||
return _report("Excluding", "critical", 3);
|
||||
}
|
||||
|
||||
if(std::regex_search(function_name, exclude_printf))
|
||||
{
|
||||
return _report("Excluding", "critical-printf", 3);
|
||||
}
|
||||
|
||||
if(whole.count(function_name) > 0)
|
||||
{
|
||||
return _report("Excluding", "critical-whole-match", 3);
|
||||
|
||||
@@ -34,6 +34,7 @@ target_link_libraries(
|
||||
$<BUILD_INTERFACE:omnitrace::omnitrace-roctracer>
|
||||
$<BUILD_INTERFACE:omnitrace::omnitrace-rocprofiler>
|
||||
$<BUILD_INTERFACE:omnitrace::omnitrace-rocm-smi>
|
||||
$<BUILD_INTERFACE:omnitrace::omnitrace-rccl>
|
||||
$<BUILD_INTERFACE:$<IF:$<BOOL:${OMNITRACE_BUILD_LTO}>,omnitrace::omnitrace-lto,>>
|
||||
$<BUILD_INTERFACE:$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBGCC}>,omnitrace::omnitrace-static-libgcc,>>
|
||||
$<BUILD_INTERFACE:$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBSTDCXX}>,omnitrace::omnitrace-static-libstdcxx,>>
|
||||
@@ -100,6 +101,7 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/process_sampler.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ptl.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/rcclp.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/rocm.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/rocprofiler.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/roctracer.hpp
|
||||
@@ -118,6 +120,7 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/functors.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/omnitrace.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/rocm_smi.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/rocprofiler.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer.hpp
|
||||
@@ -143,6 +146,13 @@ if(OMNITRACE_USE_ROCTRACER)
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/roctracer.cpp)
|
||||
endif()
|
||||
|
||||
if(OMNITRACE_USE_RCCL)
|
||||
target_sources(
|
||||
omnitrace-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/rcclp.cpp)
|
||||
endif()
|
||||
|
||||
if(OMNITRACE_USE_ROCPROFILER)
|
||||
target_sources(
|
||||
omnitrace-object-library
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "library/ompt.hpp"
|
||||
#include "library/process_sampler.hpp"
|
||||
#include "library/ptl.hpp"
|
||||
#include "library/rcclp.hpp"
|
||||
#include "library/rocprofiler.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
@@ -647,6 +648,12 @@ omnitrace_init_tooling_hidden()
|
||||
ompt::setup();
|
||||
}
|
||||
|
||||
if(get_use_rcclp())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Setting up RCCLP...\n");
|
||||
rcclp::setup();
|
||||
}
|
||||
|
||||
if(get_use_perfetto() && !is_system_backend())
|
||||
{
|
||||
#if defined(CUSTOM_DATA_SOURCE)
|
||||
@@ -840,6 +847,12 @@ omnitrace_finalize_hidden(void)
|
||||
}
|
||||
}
|
||||
|
||||
if(get_use_rcclp())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down RCCLP...\n");
|
||||
rcclp::shutdown();
|
||||
}
|
||||
|
||||
if(get_use_ompt())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down OMPT...\n");
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
TIMEMORY_DEFINE_NS_API(api, omnitrace)
|
||||
TIMEMORY_DEFINE_NS_API(api, sampling)
|
||||
TIMEMORY_DEFINE_NS_API(api, rocm_smi)
|
||||
TIMEMORY_DEFINE_NS_API(api, rccl)
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "library/common.hpp"
|
||||
#include "library/defines.hpp"
|
||||
|
||||
#include <timemory/api.hpp>
|
||||
@@ -32,6 +33,7 @@
|
||||
#include <timemory/enum.h>
|
||||
#include <timemory/mpl/concepts.hpp>
|
||||
#include <timemory/mpl/type_traits.hpp>
|
||||
#include <timemory/mpl/types.hpp>
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
@@ -40,6 +42,10 @@ TIMEMORY_DEFINE_NS_API(category, process_sampling)
|
||||
|
||||
TIMEMORY_DECLARE_COMPONENT(roctracer)
|
||||
TIMEMORY_DECLARE_COMPONENT(rocprofiler)
|
||||
TIMEMORY_DECLARE_COMPONENT(rccl_comm_data)
|
||||
TIMEMORY_DECLARE_COMPONENT(rcclp_handle)
|
||||
TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl)
|
||||
TIMEMORY_COMPONENT_ALIAS(rccl_data_tracker_t, data_tracker<float, rccl_api_t>)
|
||||
|
||||
/// \struct tim::trait::name
|
||||
/// \brief provides a constexpr string in ::value
|
||||
@@ -74,6 +80,7 @@ TIMEMORY_DEFINE_NS_API(category, pthread)
|
||||
TIMEMORY_DEFINE_NS_API(category, kokkos)
|
||||
TIMEMORY_DEFINE_NS_API(category, mpi)
|
||||
TIMEMORY_DEFINE_NS_API(category, ompt)
|
||||
TIMEMORY_DEFINE_NS_API(category, rccl)
|
||||
TIMEMORY_DEFINE_NS_API(category, critical_trace)
|
||||
TIMEMORY_DEFINE_NS_API(category, host_critical_trace)
|
||||
TIMEMORY_DEFINE_NS_API(category, device_critical_trace)
|
||||
@@ -93,6 +100,7 @@ TIMEMORY_DEFINE_NAME_TRAIT("pthread", category::pthread);
|
||||
TIMEMORY_DEFINE_NAME_TRAIT("kokkos", category::kokkos);
|
||||
TIMEMORY_DEFINE_NAME_TRAIT("mpi", category::mpi);
|
||||
TIMEMORY_DEFINE_NAME_TRAIT("ompt", category::ompt);
|
||||
TIMEMORY_DEFINE_NAME_TRAIT("rccl", category::rccl);
|
||||
TIMEMORY_DEFINE_NAME_TRAIT("critical-trace", category::critical_trace);
|
||||
TIMEMORY_DEFINE_NAME_TRAIT("host-critical-trace", category::host_critical_trace);
|
||||
TIMEMORY_DEFINE_NAME_TRAIT("device-critical-trace", category::device_critical_trace);
|
||||
@@ -150,6 +158,13 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type)
|
||||
#endif
|
||||
|
||||
#if !defined(OMNITRACE_USE_RCCL)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_comm_data, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_data_tracker_t, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type)
|
||||
#endif
|
||||
|
||||
#if !defined(TIMEMORY_USE_LIBUNWIND)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type)
|
||||
|
||||
@@ -0,0 +1,269 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/rcclp.hpp"
|
||||
#include "library/rcclp.hpp"
|
||||
|
||||
#include <timemory/manager.hpp>
|
||||
|
||||
std::ostream&
|
||||
operator<<(std::ostream& _os, const ncclUniqueId& _v)
|
||||
{
|
||||
for(auto itr : _v.internal)
|
||||
_os << itr;
|
||||
return _os;
|
||||
}
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
uint64_t
|
||||
activate_rcclp()
|
||||
{
|
||||
using handle_t = tim::component::rcclp_handle;
|
||||
|
||||
static auto _handle = std::shared_ptr<handle_t>{};
|
||||
|
||||
if(!_handle.get())
|
||||
{
|
||||
_handle = std::make_shared<handle_t>();
|
||||
_handle->start();
|
||||
|
||||
auto cleanup_functor = [=]() {
|
||||
if(_handle)
|
||||
{
|
||||
_handle->stop();
|
||||
_handle.reset();
|
||||
}
|
||||
};
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
|
||||
<< demangle<api::rccl>();
|
||||
tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
//
|
||||
//======================================================================================//
|
||||
//
|
||||
uint64_t
|
||||
deactivate_rcclp(uint64_t id)
|
||||
{
|
||||
if(id > 0)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
|
||||
<< demangle<api::rccl>();
|
||||
tim::manager::instance()->cleanup(ss.str());
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
//
|
||||
//======================================================================================//
|
||||
//
|
||||
void
|
||||
configure_rcclp(const std::set<std::string>& permit, const std::set<std::string>& reject)
|
||||
{
|
||||
static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS;
|
||||
|
||||
using rcclp_gotcha_t =
|
||||
tim::component::gotcha<rcclp_wrapper_count, rccl_toolset_t, api::rccl>;
|
||||
|
||||
static bool is_initialized = false;
|
||||
if(!is_initialized)
|
||||
{
|
||||
// generate the gotcha wrappers
|
||||
rcclp_gotcha_t::get_initializer() = []() {
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion);
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv);
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum);
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy);
|
||||
};
|
||||
|
||||
// provide environment variable for suppressing wrappers
|
||||
rcclp_gotcha_t::get_reject_list() = [reject]() {
|
||||
auto _reject = reject;
|
||||
// check environment
|
||||
auto reject_list =
|
||||
tim::get_env<std::string>("OMNITRACE_RCCLP_REJECT_LIST", "");
|
||||
// add environment setting
|
||||
for(const auto& itr : tim::delimit(reject_list))
|
||||
_reject.insert(itr);
|
||||
return _reject;
|
||||
};
|
||||
|
||||
// provide environment variable for selecting wrappers
|
||||
rcclp_gotcha_t::get_permit_list() = [permit]() {
|
||||
auto _permit = permit;
|
||||
// check environment
|
||||
auto permit_list =
|
||||
tim::get_env<std::string>("OMNITRACE_RCCLP_PERMIT_LIST", "");
|
||||
// add environment setting
|
||||
for(const auto& itr : tim::delimit(permit_list))
|
||||
_permit.insert(itr);
|
||||
return _permit;
|
||||
};
|
||||
|
||||
is_initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
rcclp_handle::start()
|
||||
{
|
||||
if(get_tool_count()++ == 0)
|
||||
{
|
||||
get_tool_instance() = std::make_shared<rcclp_tuple_t>("timemory_rcclp");
|
||||
get_tool_instance()->start();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
rcclp_handle::stop()
|
||||
{
|
||||
auto idx = --get_tool_count();
|
||||
if(get_tool_instance().get())
|
||||
{
|
||||
get_tool_instance()->stop();
|
||||
if(idx == 0) get_tool_instance().reset();
|
||||
}
|
||||
}
|
||||
|
||||
rcclp_handle::persistent_data&
|
||||
rcclp_handle::get_persistent_data()
|
||||
{
|
||||
static persistent_data _instance;
|
||||
return _instance;
|
||||
}
|
||||
|
||||
std::atomic<short>&
|
||||
rcclp_handle::get_configured()
|
||||
{
|
||||
return get_persistent_data().m_configured;
|
||||
}
|
||||
|
||||
rcclp_handle::toolset_ptr_t&
|
||||
rcclp_handle::get_tool_instance()
|
||||
{
|
||||
return get_persistent_data().m_tool;
|
||||
}
|
||||
|
||||
std::atomic<int64_t>&
|
||||
rcclp_handle::get_tool_count()
|
||||
{
|
||||
return get_persistent_data().m_count;
|
||||
}
|
||||
|
||||
void
|
||||
rccl_comm_data::preinit()
|
||||
{
|
||||
omnitrace::rcclp::configure();
|
||||
}
|
||||
|
||||
// ncclReduce
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
|
||||
ncclComm_t, hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
|
||||
}
|
||||
|
||||
// ncclSend
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*,
|
||||
size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", peer));
|
||||
}
|
||||
|
||||
// ncclBcast
|
||||
// ncclRecv
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t, hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
|
||||
}
|
||||
|
||||
// ncclBroadcast
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, int root, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
|
||||
}
|
||||
|
||||
// ncclAllReduce
|
||||
// ncclReduceScatter
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size);
|
||||
}
|
||||
|
||||
// ncclAllGather
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size);
|
||||
}
|
||||
|
||||
} // namespace component
|
||||
} // namespace tim
|
||||
|
||||
TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t)
|
||||
@@ -0,0 +1,220 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/common.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <timemory/api/macros.hpp>
|
||||
#include <timemory/components/macros.hpp>
|
||||
|
||||
#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200
|
||||
# include <rccl/rccl.h>
|
||||
#else
|
||||
# include <rccl.h>
|
||||
#endif
|
||||
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#if !defined(OMNITRACE_NUM_RCCLP_WRAPPERS)
|
||||
# define OMNITRACE_NUM_RCCLP_WRAPPERS 25
|
||||
#endif
|
||||
|
||||
TIMEMORY_COMPONENT_ALIAS(
|
||||
rccl_toolset_t,
|
||||
component_bundle<rccl_api_t, omnitrace::component::category_region<category::rccl>,
|
||||
rccl_comm_data*>)
|
||||
TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t,
|
||||
gotcha<OMNITRACE_NUM_RCCLP_WRAPPERS, rccl_toolset_t, rccl_api_t>)
|
||||
|
||||
#if !defined(OMNITRACE_USE_RCCL)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type)
|
||||
#endif
|
||||
|
||||
TIMEMORY_STATISTICS_TYPE(component::rccl_data_tracker_t, float)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::rccl_data_tracker_t,
|
||||
true_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, component::rccl_data_tracker_t,
|
||||
true_type)
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
uint64_t
|
||||
activate_rcclp();
|
||||
|
||||
uint64_t
|
||||
deactivate_rcclp(uint64_t id);
|
||||
|
||||
void
|
||||
configure_rcclp(const std::set<std::string>& permit = {},
|
||||
const std::set<std::string>& reject = {});
|
||||
|
||||
struct rcclp_handle : base<rcclp_handle, void>
|
||||
{
|
||||
static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS;
|
||||
|
||||
using value_type = void;
|
||||
using this_type = rcclp_handle;
|
||||
using base_type = base<this_type, value_type>;
|
||||
|
||||
using rcclp_tuple_t = tim::component_tuple<rcclp_gotcha_t>;
|
||||
using toolset_ptr_t = std::shared_ptr<rcclp_tuple_t>;
|
||||
|
||||
static std::string label() { return "rcclp_handle"; }
|
||||
static std::string description() { return "Handle for activating NCCL wrappers"; }
|
||||
static void get() {}
|
||||
static void start();
|
||||
static void stop();
|
||||
static int get_count() { return get_tool_count().load(); }
|
||||
|
||||
private:
|
||||
struct persistent_data
|
||||
{
|
||||
std::atomic<short> m_configured{ 0 };
|
||||
std::atomic<int64_t> m_count{ 0 };
|
||||
toolset_ptr_t m_tool = toolset_ptr_t{};
|
||||
};
|
||||
|
||||
static persistent_data& get_persistent_data();
|
||||
static std::atomic<short>& get_configured();
|
||||
static toolset_ptr_t& get_tool_instance();
|
||||
static std::atomic<int64_t>& get_tool_count();
|
||||
};
|
||||
|
||||
struct rccl_comm_data : base<rccl_comm_data, void>
|
||||
{
|
||||
using value_type = void;
|
||||
using this_type = rccl_comm_data;
|
||||
using base_type = base<this_type, value_type>;
|
||||
using tracker_t = tim::auto_tuple<rccl_data_tracker_t>;
|
||||
using data_type = float;
|
||||
|
||||
TIMEMORY_DEFAULT_OBJECT(rccl_comm_data)
|
||||
|
||||
static void preinit();
|
||||
static void start() {}
|
||||
static void stop() {}
|
||||
|
||||
static auto rccl_type_size(ncclDataType_t datatype)
|
||||
{
|
||||
switch(datatype)
|
||||
{
|
||||
case ncclInt8:
|
||||
case ncclUint8: return 1;
|
||||
case ncclFloat16: return 2;
|
||||
case ncclInt32:
|
||||
case ncclUint32:
|
||||
case ncclFloat32: return 4;
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
case ncclFloat64: return 8;
|
||||
default: return 0;
|
||||
};
|
||||
}
|
||||
|
||||
// ncclReduce
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
|
||||
ncclComm_t, hipStream_t);
|
||||
|
||||
// ncclSend
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*,
|
||||
size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
|
||||
hipStream_t);
|
||||
|
||||
// ncclBcast
|
||||
// ncclRecv
|
||||
static void audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t, hipStream_t);
|
||||
|
||||
// ncclBroadcast
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, int root, ncclComm_t,
|
||||
hipStream_t);
|
||||
|
||||
// ncclAllReduce
|
||||
// ncclReduceScatter
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
|
||||
hipStream_t);
|
||||
|
||||
// ncclAllGather
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);
|
||||
|
||||
private:
|
||||
template <typename... Args>
|
||||
static void add(tracker_t& _t, data_type value, Args&&... args)
|
||||
{
|
||||
_t.store(std::plus<data_type>{}, value);
|
||||
TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward<Args>(args), value));
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void add(const gotcha_data& _data, data_type value, Args&&... args)
|
||||
{
|
||||
tracker_t _t{ std::string_view{ _data.tool_id.c_str() } };
|
||||
add(_t, value, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void add_secondary(tracker_t&, const gotcha_data& _data, data_type value,
|
||||
Args&&... args)
|
||||
{
|
||||
// if(tim::settings::add_secondary())
|
||||
{
|
||||
tracker_t _s{ std::string_view{ _data.tool_id.c_str() } };
|
||||
add(_s, _data, value, std::forward<Args>(args)...);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void add(std::string_view _name, data_type value, Args&&... args)
|
||||
{
|
||||
tracker_t _t{ _name };
|
||||
add(_t, value, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void add_secondary(tracker_t&, std::string_view _name, data_type value,
|
||||
Args&&... args)
|
||||
{
|
||||
// if(tim::settings::add_secondary())
|
||||
{
|
||||
tracker_t _s{ _name };
|
||||
add(_s, value, std::forward<Args>(args)...);
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace component
|
||||
} // namespace tim
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "library/debug.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/gpu.hpp"
|
||||
#include "library/mproc.hpp"
|
||||
#include "library/perfetto.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
|
||||
@@ -46,9 +47,11 @@
|
||||
#include <csignal>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -270,6 +273,11 @@ configure_settings(bool _init)
|
||||
"Enable support for Kokkos Tools", false, "kokkos",
|
||||
"backend");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_USE_RCCLP",
|
||||
"Enable support for ROCm Communication Collectives Library (RCCL) Performance",
|
||||
false, "rocm", "rccl", "backend");
|
||||
|
||||
OMNITRACE_CONFIG_CL_SETTING(
|
||||
bool, "OMNITRACE_KOKKOS_KERNEL_LOGGER", "Enables kernel logging", false,
|
||||
"--omnitrace-kokkos-kernel-logger", "kokkos", "debugging");
|
||||
@@ -582,12 +590,30 @@ configure_settings(bool _init)
|
||||
}
|
||||
if(!_found_sep && _cmd.size() > 1) _cmd.insert(_cmd.begin() + 1, "--");
|
||||
|
||||
auto _pid = getpid();
|
||||
auto _ppid = getppid();
|
||||
auto _proc = mproc::get_concurrent_processes(_ppid);
|
||||
bool _main_proc = (_proc.size() < 2 || *_proc.begin() == _pid);
|
||||
|
||||
for(auto&& itr :
|
||||
tim::delimit(_config->get<std::string>("OMNITRACE_CONFIG_FILE"), ";:"))
|
||||
{
|
||||
if(_config->get_suppress_config()) continue;
|
||||
OMNITRACE_BASIC_VERBOSE(1, "Reading config file %s\n", itr.c_str());
|
||||
_config->read(itr);
|
||||
if(_config->get<bool>("OMNITRACE_CI") && _main_proc)
|
||||
{
|
||||
std::ifstream _in{ itr };
|
||||
std::stringstream _iss{};
|
||||
while(_in)
|
||||
{
|
||||
std::string _s{};
|
||||
getline(_in, _s);
|
||||
_iss << _s << "\n";
|
||||
}
|
||||
OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(),
|
||||
_iss.str().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
settings::suppress_config() = true;
|
||||
@@ -666,6 +692,7 @@ configure_mode_settings()
|
||||
_set("OMNITRACE_USE_ROCTRACER", false);
|
||||
_set("OMNITRACE_USE_ROCPROFILER", false);
|
||||
_set("OMNITRACE_USE_KOKKOSP", false);
|
||||
_set("OMNITRACE_USE_RCCLP", false);
|
||||
_set("OMNITRACE_USE_OMPT", false);
|
||||
_set("OMNITRACE_USE_SAMPLING", false);
|
||||
_set("OMNITRACE_USE_PROCESS_SAMPLING", false);
|
||||
@@ -721,6 +748,7 @@ configure_mode_settings()
|
||||
_set("OMNITRACE_USE_ROCTRACER", false);
|
||||
_set("OMNITRACE_USE_ROCPROFILER", false);
|
||||
_set("OMNITRACE_USE_KOKKOSP", false);
|
||||
_set("OMNITRACE_USE_RCCLP", false);
|
||||
_set("OMNITRACE_USE_OMPT", false);
|
||||
_set("OMNITRACE_USE_SAMPLING", false);
|
||||
_set("OMNITRACE_USE_PROCESS_SAMPLING", false);
|
||||
@@ -817,6 +845,7 @@ configure_disabled_settings()
|
||||
_handle_use_option("OMNITRACE_USE_PERFETTO", "perfetto");
|
||||
_handle_use_option("OMNITRACE_USE_TIMEMORY", "timemory");
|
||||
_handle_use_option("OMNITRACE_USE_OMPT", "ompt");
|
||||
_handle_use_option("OMNITRACE_USE_RCCLP", "rcclp");
|
||||
_handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi");
|
||||
_handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer");
|
||||
_handle_use_option("OMNITRACE_USE_ROCPROFILER", "rocprofiler");
|
||||
@@ -1355,6 +1384,13 @@ get_use_code_coverage()
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_use_rcclp()
|
||||
{
|
||||
static auto _v = get_config()->find("OMNITRACE_USE_RCCLP");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_critical_trace_debug()
|
||||
{
|
||||
|
||||
@@ -216,6 +216,9 @@ get_use_sampling_cputime();
|
||||
int
|
||||
get_sampling_rtoffset();
|
||||
|
||||
bool
|
||||
get_use_rcclp();
|
||||
|
||||
bool
|
||||
get_timeline_sampling();
|
||||
|
||||
|
||||
@@ -83,6 +83,8 @@
|
||||
perfetto::Category("kokkos").SetDescription("Kokkos regions"), \
|
||||
perfetto::Category("mpi").SetDescription("MPI regions"), \
|
||||
perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \
|
||||
perfetto::Category("rccl").SetDescription( \
|
||||
"ROCm Communication Collectives Library (RCCL) regions"), \
|
||||
perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \
|
||||
perfetto::Category("host-critical-trace") \
|
||||
.SetDescription("Host-side critical traces"), \
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/rcclp.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <timemory/timemory.hpp>
|
||||
|
||||
#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200
|
||||
# include <rccl/rccl.h>
|
||||
#else
|
||||
# include <rccl.h>
|
||||
#endif
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
||||
static uint64_t global_id = std::numeric_limits<uint64_t>::max();
|
||||
static void* librccl_handle = nullptr;
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace rcclp
|
||||
{
|
||||
void
|
||||
configure()
|
||||
{
|
||||
comp::rccl_data_tracker_t::label() = "rccl_comm_data";
|
||||
comp::rccl_data_tracker_t::description() = "Tracks RCCL communication data";
|
||||
}
|
||||
|
||||
void
|
||||
setup()
|
||||
{
|
||||
configure();
|
||||
|
||||
// make sure the symbols are loaded to be wrapped
|
||||
auto libpath = tim::get_env<std::string>("OMNITRACE_RCCL_LIBRARY", "librccl.so");
|
||||
librccl_handle = dlopen(libpath.c_str(), RTLD_NOW | RTLD_GLOBAL);
|
||||
if(!librccl_handle) fprintf(stderr, "%s\n", dlerror());
|
||||
dlerror(); // Clear any existing error
|
||||
|
||||
auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true);
|
||||
if(_data)
|
||||
comp::rccl_toolset_t::get_initializer() = [](comp::rccl_toolset_t& cb) {
|
||||
cb.initialize<comp::rccl_comm_data>();
|
||||
};
|
||||
|
||||
comp::configure_rcclp();
|
||||
global_id = comp::activate_rcclp();
|
||||
if(librccl_handle) dlclose(librccl_handle);
|
||||
}
|
||||
|
||||
void
|
||||
shutdown()
|
||||
{
|
||||
if(global_id < std::numeric_limits<uint64_t>::max())
|
||||
comp::deactivate_rcclp(global_id);
|
||||
}
|
||||
} // namespace rcclp
|
||||
} // namespace omnitrace
|
||||
@@ -0,0 +1,57 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "library/defines.hpp"
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace rcclp
|
||||
{
|
||||
void
|
||||
configure();
|
||||
|
||||
void
|
||||
setup();
|
||||
|
||||
void
|
||||
shutdown();
|
||||
|
||||
#if !defined(OMNITRACE_USE_RCCL) || \
|
||||
(defined(OMNITRACE_USE_RCCL) && OMNITRACE_USE_RCCL == 0)
|
||||
inline void
|
||||
configure()
|
||||
{}
|
||||
|
||||
inline void
|
||||
setup()
|
||||
{}
|
||||
|
||||
inline void
|
||||
shutdown()
|
||||
{}
|
||||
#endif
|
||||
} // namespace rcclp
|
||||
} // namespace omnitrace
|
||||
@@ -0,0 +1,522 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_H_
|
||||
#define NCCL_H_
|
||||
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#define NCCL_MAJOR 2
|
||||
#define NCCL_MINOR 11
|
||||
#define NCCL_PATCH 4
|
||||
#define NCCL_SUFFIX ""
|
||||
|
||||
#define NCCL_VERSION_CODE 21104
|
||||
#define NCCL_VERSION(X, Y, Z) \
|
||||
(((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z))
|
||||
|
||||
#define RCCL_BFLOAT16 1
|
||||
#define RCCL_GATHER_SCATTER 1
|
||||
#define RCCL_ALLTOALLV 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
/*! @brief Opaque handle to communicator */
|
||||
typedef struct ncclComm* ncclComm_t;
|
||||
|
||||
#define NCCL_UNIQUE_ID_BYTES 128
|
||||
typedef struct
|
||||
{
|
||||
char internal[NCCL_UNIQUE_ID_BYTES];
|
||||
} ncclUniqueId;
|
||||
|
||||
/*! @brief Error type */
|
||||
typedef enum
|
||||
{
|
||||
ncclSuccess = 0,
|
||||
ncclUnhandledCudaError = 1,
|
||||
ncclSystemError = 2,
|
||||
ncclInternalError = 3,
|
||||
ncclInvalidArgument = 4,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclNumResults = 6
|
||||
} ncclResult_t;
|
||||
|
||||
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
*
|
||||
* @details This integer is coded with the MAJOR, MINOR and PATCH level of the
|
||||
* NCCL library
|
||||
*/
|
||||
ncclResult_t ncclGetVersion(int* version);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetVersion(int* version);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Generates an ID for ncclCommInitRank
|
||||
|
||||
@details
|
||||
Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be
|
||||
called once and the Id should be distributed to all ranks in the
|
||||
communicator before calling ncclCommInitRank.
|
||||
|
||||
@param[in]
|
||||
uniqueId ncclUniqueId*
|
||||
pointer to uniqueId
|
||||
|
||||
*/
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version).
|
||||
|
||||
@details
|
||||
rank must be between 0 and nranks-1 and unique within a communicator clique.
|
||||
Each rank is associated to a CUDA device, which has to be set before calling
|
||||
ncclCommInitRank.
|
||||
ncclCommInitRank implicitly syncronizes with other ranks, so it must be
|
||||
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
|
||||
|
||||
@param[in]
|
||||
comm ncclComm_t*
|
||||
communicator struct pointer
|
||||
*/
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
|
||||
int rank);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
|
||||
int rank);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Creates a clique of communicators (single process version).
|
||||
*
|
||||
* @details This is a convenience function to create a single-process communicator
|
||||
* clique. Returns an array of ndev newly initialized communicators in comm. comm
|
||||
* should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is
|
||||
* NULL, the first ndev HIP devices are used. Order of devlist defines user-order of
|
||||
* processors within the communicator.
|
||||
* */
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object, but waits for any
|
||||
* operations that might still be running on the device */
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommDestroy(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object and aborts any
|
||||
* operations that might still be running on the device. */
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns a human-readable error message. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
|
||||
/*! @brief Checks whether the comm has encountered any asynchronous errors */
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Gets the number of ranks in the communicator clique. */
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns the rocm device number associated with the communicator. */
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns the user-ordered "rank" associated with the communicator. */
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Reduction operation selector */
|
||||
/* Reduction operation selector */
|
||||
typedef enum
|
||||
{
|
||||
ncclNumOps_dummy = 5
|
||||
} ncclRedOp_dummy_t;
|
||||
typedef enum
|
||||
{
|
||||
ncclSum = 0,
|
||||
ncclProd = 1,
|
||||
ncclMax = 2,
|
||||
ncclMin = 3,
|
||||
ncclAvg = 4,
|
||||
/* ncclNumOps: The number of built-in ncclRedOp_t values. Also
|
||||
* serves as the least possible value for dynamic ncclRedOp_t's
|
||||
* as constructed by ncclRedOpCreate*** functions. */
|
||||
ncclNumOps = 5,
|
||||
/* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
|
||||
* It is defined to be the largest signed value (since compilers
|
||||
* are permitted to use signed enums) that won't grow
|
||||
* sizeof(ncclRedOp_t) when compared to previous NCCL versions to
|
||||
* maintain ABI compatibility. */
|
||||
ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t))
|
||||
} ncclRedOp_t;
|
||||
|
||||
/*! @brief Data types */
|
||||
typedef enum
|
||||
{
|
||||
ncclInt8 = 0,
|
||||
ncclChar = 0,
|
||||
ncclUint8 = 1,
|
||||
ncclInt32 = 2,
|
||||
ncclInt = 2,
|
||||
ncclUint32 = 3,
|
||||
ncclInt64 = 4,
|
||||
ncclUint64 = 5,
|
||||
ncclFloat16 = 6,
|
||||
ncclHalf = 6,
|
||||
ncclFloat32 = 7,
|
||||
ncclFloat = 7,
|
||||
ncclFloat64 = 8,
|
||||
ncclDouble = 8,
|
||||
ncclBfloat16 = 9,
|
||||
ncclNumTypes = 10
|
||||
} ncclDataType_t;
|
||||
|
||||
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
|
||||
typedef enum
|
||||
{
|
||||
/* ncclScalarDevice: The scalar is in device-visible memory and will be
|
||||
* dereferenced while the collective is running. */
|
||||
ncclScalarDevice = 0,
|
||||
|
||||
/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
|
||||
* dereferenced before the ncclRedOpCreate***() function returns. */
|
||||
ncclScalarHostImmediate = 1
|
||||
} ncclScalarResidence_t;
|
||||
|
||||
/*
|
||||
* ncclRedOpCreatePreMulSum
|
||||
*
|
||||
* Creates a new reduction operator which pre-multiplies input values by a given
|
||||
* scalar locally before reducing them with peer values via summation. For use
|
||||
* only with collectives launched against *comm* and *datatype*. The
|
||||
* *residence* argument indicates how/when the memory pointed to by *scalar*
|
||||
* will be dereferenced. Upon return, the newly created operator's handle
|
||||
* is stored in *op*.
|
||||
*/
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
|
||||
ncclDataType_t datatype,
|
||||
ncclScalarResidence_t residence,
|
||||
ncclComm_t comm);
|
||||
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
|
||||
ncclDataType_t datatype,
|
||||
ncclScalarResidence_t residence,
|
||||
ncclComm_t comm);
|
||||
|
||||
/*
|
||||
* ncclRedOpDestroy
|
||||
*
|
||||
* Destroys the reduction operator *op*. The operator must have been created by
|
||||
* ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
|
||||
* destroyed as soon as the last NCCL function which is given that operator returns.
|
||||
*/
|
||||
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
|
||||
/*
|
||||
* Collective communication operations
|
||||
*
|
||||
* Collective communication operations must be called separately for each
|
||||
* communicator in a communicator clique.
|
||||
*
|
||||
* They return when operations have been enqueued on the CUDA stream.
|
||||
*
|
||||
* Since they may perform inter-CPU synchronization, each call has to be done
|
||||
* from a different thread or process, or need to use Group Semantics (see
|
||||
* below).
|
||||
*/
|
||||
|
||||
/*!
|
||||
* @brief Reduce
|
||||
*
|
||||
* @details Reduces data arrays of length count in sendbuff into recvbuff using op
|
||||
* operation.
|
||||
* recvbuff may be NULL on all calls except for root device.
|
||||
* root is the rank (not the CUDA device) where data will reside after the
|
||||
* operation is complete.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief (deprecated) Broadcast (in-place)
|
||||
*
|
||||
* @details Copies count values from root to all other devices.
|
||||
* root is the rank (not the CUDA device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
* This operation is implicitely in place.
|
||||
*/
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Broadcast
|
||||
*
|
||||
* @details Copies count values from root to all other devices.
|
||||
* root is the rank (not the HIP device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-Reduce
|
||||
*
|
||||
* @details Reduces data arrays of length count in sendbuff using op operation, and
|
||||
* leaves identical copies of result on each recvbuff.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*!
|
||||
* @brief Reduce-Scatter
|
||||
*
|
||||
* @details Reduces data in sendbuff using op operation and leaves reduced result
|
||||
* scattered over the devices so that recvbuff on rank i will contain the i-th
|
||||
* block of the result.
|
||||
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
||||
* should have a size of at least nranks*recvcount elements.
|
||||
*
|
||||
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
|
||||
*/
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-Gather
|
||||
*
|
||||
* @details Each device gathers sendcount values from other GPUs into recvbuff,
|
||||
* receiving data from rank i at offset i*sendcount.
|
||||
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
||||
* should have a size of at least nranks*sendcount elements.
|
||||
*
|
||||
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
|
||||
*/
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Send
|
||||
*
|
||||
* @details Send data from sendbuff to rank peer.
|
||||
* Rank peer needs to call ncclRecv with the same datatype and the same count from
|
||||
* this rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
|
||||
* operations need to progress concurrently to complete, they must be fused within a
|
||||
* ncclGroupStart/ ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
|
||||
int peer, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
|
||||
int peer, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Receive
|
||||
*
|
||||
* @details Receive data from rank peer into recvbuff.
|
||||
* Rank peer needs to call ncclSend with the same datatype and the same count to this
|
||||
* rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
|
||||
* operations need to progress concurrently to complete, they must be fused within a
|
||||
* ncclGroupStart/ ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
int peer, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Gather
|
||||
*
|
||||
* @details Root device gathers sendcount values from other GPUs into recvbuff,
|
||||
* receiving data from rank i at offset i*sendcount.
|
||||
*
|
||||
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
||||
* should have a size of at least nranks*sendcount elements.
|
||||
*
|
||||
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
|
||||
*/
|
||||
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Scatter
|
||||
*
|
||||
* @details Scattered over the devices so that recvbuff on rank i will contain the
|
||||
* i-th block of the data on root.
|
||||
*
|
||||
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
||||
* should have a size of at least nranks*recvcount elements.
|
||||
*
|
||||
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
|
||||
*/
|
||||
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-To-All
|
||||
*
|
||||
* @details Device (i) send (j)th block of data to device (j) and be placed as (i)th
|
||||
* block. Each block for sending/receiving has count elements, which means
|
||||
* that recvbuff and sendbuff should have a size of nranks*count elements.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-To-Allv
|
||||
*
|
||||
* @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
|
||||
* to device (j). In the same time, device (i) receives recvcounts[j] of data
|
||||
* from device (j) to be placed at rdispls[j].
|
||||
|
||||
* sendcounts, sdispls, recvcounts and rdispls are all measured in the units
|
||||
* of datatype, not bytes.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void* recvbuff,
|
||||
const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void* recvbuff,
|
||||
const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Group semantics
|
||||
*
|
||||
* When managing multiple GPUs from a single thread, and since NCCL collective
|
||||
* calls may perform inter-CPU synchronization, we need to "group" calls for
|
||||
* different ranks/devices into a single call.
|
||||
*
|
||||
* Grouping NCCL calls as being part of the same collective operation is done
|
||||
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
|
||||
* collective calls until the ncclGroupEnd call, which will wait for all calls
|
||||
* to be complete. Note that for collective communication, ncclGroupEnd only
|
||||
* guarantees that the operations are enqueued on the streams, not that
|
||||
* the operation is effectively done.
|
||||
*
|
||||
* Both collective communication and ncclCommInitRank can be used in conjunction
|
||||
* of ncclGroupStart/ncclGroupEnd, but not together.
|
||||
*
|
||||
* Group semantics also allow to fuse multiple operations on the same device
|
||||
* to improve performance (for aggregated collective calls), or to permit
|
||||
* concurrent progress of multiple send/receive operations.
|
||||
*/
|
||||
|
||||
/*! @brief Group Start
|
||||
*
|
||||
* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
|
||||
* a single NCCL operation. Nothing will be started on the CUDA stream until
|
||||
* ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupStart();
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupStart();
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Group End
|
||||
*
|
||||
* End a group call. Start a fused NCCL operation consisting of all calls since
|
||||
* ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
|
||||
* need to be called after ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupEnd();
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupEnd();
|
||||
/// @endcond
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
#endif
|
||||
|
||||
#endif // end include guard
|
||||
@@ -12,6 +12,11 @@ if(NOT DEFINED NUM_PROCS)
|
||||
set(NUM_PROCS 2)
|
||||
endif()
|
||||
|
||||
math(EXPR NUM_SAMPLING_PROCS "${NUM_PROCS_REAL}-1")
|
||||
if(NUM_SAMPLING_PROCS GREATER 3)
|
||||
set(NUM_SAMPLING_PROCS 3)
|
||||
endif()
|
||||
|
||||
math(EXPR NUM_THREADS "${NUM_PROCS_REAL} + (${NUM_PROCS_REAL} / 2)")
|
||||
if(NUM_THREADS GREATER 12)
|
||||
set(NUM_THREADS 12)
|
||||
@@ -30,8 +35,8 @@ set(_test_openmp_env "OMP_PROC_BIND=spread" "OMP_PLACES=threads" "OMP_NUM_THREAD
|
||||
|
||||
set(_base_environment
|
||||
"OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON"
|
||||
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_FILE_OUTPUT=ON" "${_test_openmp_env}" "${_test_library_path}")
|
||||
|
||||
set(_flat_environment
|
||||
"OMNITRACE_USE_PERFETTO=ON"
|
||||
@@ -43,11 +48,8 @@ set(_flat_environment
|
||||
"OMNITRACE_COLLAPSE_PROCESSES=ON"
|
||||
"OMNITRACE_COLLAPSE_THREADS=ON"
|
||||
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count"
|
||||
"OMP_PROC_BIND=spread"
|
||||
"OMP_PLACES=threads"
|
||||
"OMP_NUM_THREADS=2"
|
||||
"LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}"
|
||||
)
|
||||
"${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
|
||||
set(_lock_environment
|
||||
"OMNITRACE_USE_SAMPLING=OFF"
|
||||
@@ -62,18 +64,34 @@ set(_lock_environment
|
||||
"${_test_library_path}")
|
||||
|
||||
set(_ompt_environment
|
||||
"OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_USE_OMPT=ON" "OMNITRACE_CRITICAL_TRACE=OFF" "${_test_openmp_env}"
|
||||
"OMNITRACE_USE_PERFETTO=ON"
|
||||
"OMNITRACE_USE_TIMEMORY=ON"
|
||||
"OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_USE_OMPT=ON"
|
||||
"OMNITRACE_CRITICAL_TRACE=OFF"
|
||||
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss"
|
||||
"${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
|
||||
set(_perfetto_environment
|
||||
"OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=OFF" "OMNITRACE_USE_SAMPLING=ON"
|
||||
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}"
|
||||
"OMNITRACE_USE_PERFETTO=ON"
|
||||
"OMNITRACE_USE_TIMEMORY=OFF"
|
||||
"OMNITRACE_USE_SAMPLING=ON"
|
||||
"OMNITRACE_USE_PROCESS_SAMPLING=ON"
|
||||
"OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_PERFETTO_BACKEND=inprocess"
|
||||
"OMNITRACE_PERFETTO_FILL_POLICY=ring_buffer"
|
||||
"${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
|
||||
set(_timemory_environment
|
||||
"OMNITRACE_USE_PERFETTO=OFF" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON"
|
||||
"OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}"
|
||||
"OMNITRACE_USE_PERFETTO=OFF"
|
||||
"OMNITRACE_USE_TIMEMORY=ON"
|
||||
"OMNITRACE_USE_SAMPLING=ON"
|
||||
"OMNITRACE_USE_PROCESS_SAMPLING=ON"
|
||||
"OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss"
|
||||
"${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
|
||||
set(_test_environment ${_base_environment} "OMNITRACE_CRITICAL_TRACE=OFF")
|
||||
@@ -86,7 +104,7 @@ set(_python_environment
|
||||
"OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_TREE_OUTPUT=OFF"
|
||||
"OMNITRACE_USE_PID=OFF"
|
||||
"OMNITRACE_TIMEMORY_COMPONENTS=trip_count"
|
||||
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count"
|
||||
"${_test_library_path}"
|
||||
"PYTHONPATH=${PROJECT_BINARY_DIR}/lib/python/site-packages")
|
||||
|
||||
@@ -100,6 +118,18 @@ set(_attach_environment
|
||||
"OMNITRACE_USE_KOKKOSP=ON"
|
||||
"OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_USE_PID=OFF"
|
||||
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count"
|
||||
"${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
|
||||
set(_rccl_environment
|
||||
"OMNITRACE_USE_PERFETTO=ON"
|
||||
"OMNITRACE_USE_TIMEMORY=ON"
|
||||
"OMNITRACE_USE_SAMPLING=OFF"
|
||||
"OMNITRACE_USE_PROCESS_SAMPLING=ON"
|
||||
"OMNITRACE_USE_RCCLP=ON"
|
||||
"OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_USE_PID=OFF"
|
||||
"${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
|
||||
@@ -123,6 +153,83 @@ endif()
|
||||
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
set(_VALID_GPU OFF)
|
||||
if(OMNITRACE_USE_HIP)
|
||||
set(_VALID_GPU ON)
|
||||
find_program(
|
||||
OMNITRACE_ROCM_SMI_EXE
|
||||
NAMES rocm-smi
|
||||
HINTS ${ROCmVersion_DIR}
|
||||
PATHS ${ROCmVersion_DIR}
|
||||
PATH_SUFFIXES bin)
|
||||
if(OMNITRACE_ROCM_SMI_EXE)
|
||||
execute_process(
|
||||
COMMAND ${OMNITRACE_ROCM_SMI_EXE}
|
||||
OUTPUT_VARIABLE _RSMI_OUT
|
||||
ERROR_VARIABLE _RSMI_ERR
|
||||
RESULT_VARIABLE _RSMI_RET)
|
||||
if(_RSMI_RET EQUAL 0)
|
||||
if("${_RSMI_OUTPUT}" MATCHES "ERROR" OR "${_RSMI_ERR}" MATCHES "ERROR")
|
||||
set(_VALID_GPU OFF)
|
||||
endif()
|
||||
else()
|
||||
set(_VALID_GPU OFF)
|
||||
endif()
|
||||
endif()
|
||||
if(NOT _VALID_GPU)
|
||||
omnitrace_message(AUTHOR_WARNING
|
||||
"rocm-smi did not successfully run. Disabling GPU tests...")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(LULESH_USE_GPU ${LULESH_USE_HIP})
|
||||
if(LULESH_USE_CUDA)
|
||||
set(LULESH_USE_GPU ON)
|
||||
endif()
|
||||
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV)
|
||||
set(_ENV_ONLY "OMNITRACE_(USE_MPIP|DEBUG_SETTINGS)=")
|
||||
set(_FILE_CONTENTS)
|
||||
set(_ENV_CONTENTS)
|
||||
|
||||
foreach(_VAL ${${_ENV}})
|
||||
if("${_VAL}" MATCHES "^OMNITRACE_" AND NOT "${_VAL}" MATCHES "${_ENV_ONLY}")
|
||||
set(_FILE_CONTENTS "${_FILE_CONTENTS}${_VAL}\n")
|
||||
else()
|
||||
list(APPEND _ENV_CONTENTS "${_VAL}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(_CONFIG_FILE ${PROJECT_BINARY_DIR}/omnitrace-tests-config/${_FILE})
|
||||
file(
|
||||
WRITE ${_CONFIG_FILE}
|
||||
"# auto-generated by cmake
|
||||
|
||||
# default values
|
||||
OMNITRACE_CI = ON
|
||||
OMNITRACE_VERBOSE = 1
|
||||
OMNITRACE_DL_VERBOSE = 1
|
||||
OMNITRACE_SAMPLING_FREQ = 50
|
||||
OMNITRACE_SAMPLING_DELAY = 0.05
|
||||
OMNITRACE_SAMPLING_CPUS = 0-${NUM_SAMPLING_PROCS}
|
||||
OMNITRACE_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES
|
||||
OMNITRACE_ROCTRACER_HSA_API = ON
|
||||
OMNITRACE_ROCTRACER_HSA_ACTIVITY = ON
|
||||
|
||||
# test-specific values
|
||||
${_FILE_CONTENTS}
|
||||
")
|
||||
list(APPEND _ENV_CONTENTS "OMNITRACE_CONFIG_FILE=${_CONFIG_FILE}")
|
||||
list(APPEND _ENV_CONTENTS "OMNITRACE_DEBUG_SETTINGS=1")
|
||||
set(${_ENV}
|
||||
"${_ENV_CONTENTS}"
|
||||
PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
function(OMNITRACE_ADD_TEST)
|
||||
foreach(_PREFIX RUNTIME REWRITE REWRITE_RUN)
|
||||
foreach(_TYPE PASS FAIL SKIP)
|
||||
@@ -135,10 +242,17 @@ function(OMNITRACE_ADD_TEST)
|
||||
cmake_parse_arguments(
|
||||
TEST
|
||||
"SKIP_BASELINE;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING" # options
|
||||
"NAME;TARGET;MPI;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value args
|
||||
"NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value
|
||||
# args
|
||||
"${_KWARGS}" # multiple value args
|
||||
${ARGN})
|
||||
|
||||
if(TEST_GPU AND NOT _VALID_GPU)
|
||||
omnitrace_message(STATUS
|
||||
"${TEST_NAME} requires a GPU and no valid GPUs were found")
|
||||
return()
|
||||
endif()
|
||||
|
||||
if("${TEST_MPI}" STREQUAL "")
|
||||
set(TEST_MPI OFF)
|
||||
endif()
|
||||
@@ -189,7 +303,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
add_test(
|
||||
NAME ${TEST_NAME}-baseline
|
||||
COMMAND ${COMMAND_PREFIX} $<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
|
||||
endif()
|
||||
|
||||
if(NOT TEST_SKIP_REWRITE)
|
||||
@@ -199,7 +313,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
$<TARGET_FILE:omnitrace-exe> -o
|
||||
$<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.inst
|
||||
${TEST_REWRITE_ARGS} -- $<TARGET_FILE:${TEST_TARGET}>
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
|
||||
|
||||
if(NOT TEST_SKIP_SAMPLING)
|
||||
add_test(
|
||||
@@ -208,7 +322,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
$<TARGET_FILE:omnitrace-exe> -o
|
||||
$<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.samp -M sampling
|
||||
${TEST_REWRITE_ARGS} -- $<TARGET_FILE:${TEST_TARGET}>
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
|
||||
endif()
|
||||
|
||||
add_test(
|
||||
@@ -216,7 +330,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
COMMAND
|
||||
${COMMAND_PREFIX} $<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.inst
|
||||
${TEST_RUN_ARGS}
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
|
||||
|
||||
if(NOT TEST_SKIP_SAMPLING)
|
||||
add_test(
|
||||
@@ -225,7 +339,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
${COMMAND_PREFIX}
|
||||
$<TARGET_FILE_DIR:${TEST_TARGET}>/${TEST_NAME}.samp
|
||||
${TEST_RUN_ARGS}
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -234,14 +348,14 @@ function(OMNITRACE_ADD_TEST)
|
||||
NAME ${TEST_NAME}-runtime-instrument
|
||||
COMMAND $<TARGET_FILE:omnitrace-exe> ${TEST_RUNTIME_ARGS} --
|
||||
$<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
|
||||
|
||||
if(NOT TEST_SKIP_SAMPLING)
|
||||
add_test(
|
||||
NAME ${TEST_NAME}-runtime-instrument-sampling
|
||||
COMMAND $<TARGET_FILE:omnitrace-exe> -M sampling ${TEST_RUNTIME_ARGS}
|
||||
-- $<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:${TEST_TARGET}>)
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -303,6 +417,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
endforeach()
|
||||
|
||||
if(TEST ${TEST_NAME}-${_TEST})
|
||||
omnitrace_write_test_config(${TEST_NAME}-${_TEST}.cfg _environ)
|
||||
set_tests_properties(
|
||||
${TEST_NAME}-${_TEST}
|
||||
PROPERTIES ENVIRONMENT
|
||||
@@ -431,6 +546,7 @@ endfunction()
|
||||
# general config file tests
|
||||
#
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
file(
|
||||
WRITE ${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg
|
||||
"
|
||||
@@ -476,6 +592,7 @@ omnitrace_add_test(
|
||||
NAME transpose
|
||||
TARGET transpose
|
||||
MPI ${TRANSPOSE_USE_MPI}
|
||||
GPU ON
|
||||
NUM_PROCS ${NUM_PROCS}
|
||||
REWRITE_ARGS -e -v 2 --print-instructions -E uniform_int_distribution
|
||||
RUNTIME_ARGS
|
||||
@@ -497,6 +614,7 @@ omnitrace_add_test(
|
||||
TARGET transpose
|
||||
LABELS "loops"
|
||||
MPI ${TRANSPOSE_USE_MPI}
|
||||
GPU ON
|
||||
NUM_PROCS ${NUM_PROCS}
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
@@ -577,78 +695,82 @@ omnitrace_add_test(
|
||||
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF"
|
||||
REWRITE_FAIL_REGEX "0 instrumented loops in procedure")
|
||||
|
||||
omnitrace_add_test(
|
||||
SKIP_RUNTIME
|
||||
NAME "mpi"
|
||||
TARGET mpi-example
|
||||
MPI ON
|
||||
NUM_PROCS 4
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
2
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
--min-instructions
|
||||
0
|
||||
ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')"
|
||||
REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)")
|
||||
if(OMNITRACE_USE_MPI OR OMNITRACE_USE_MPI_HEADERS)
|
||||
omnitrace_add_test(
|
||||
SKIP_RUNTIME
|
||||
NAME "mpi"
|
||||
TARGET mpi-example
|
||||
MPI ON
|
||||
NUM_PROCS 4
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
2
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
--min-instructions
|
||||
0
|
||||
ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')"
|
||||
REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)")
|
||||
|
||||
omnitrace_add_test(
|
||||
SKIP_RUNTIME SKIP_SAMPLING
|
||||
NAME "mpi-flat-mpip"
|
||||
TARGET mpi-example
|
||||
MPI ON
|
||||
NUM_PROCS 4
|
||||
LABELS "mpip"
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
2
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
--min-instructions
|
||||
0
|
||||
ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_USE_MPIP=ON"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
|
||||
)
|
||||
omnitrace_add_test(
|
||||
SKIP_RUNTIME SKIP_SAMPLING
|
||||
NAME "mpi-flat-mpip"
|
||||
TARGET mpi-example
|
||||
MPI ON
|
||||
NUM_PROCS 4
|
||||
LABELS "mpip"
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
2
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
--min-instructions
|
||||
0
|
||||
ENVIRONMENT
|
||||
"${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
|
||||
)
|
||||
|
||||
omnitrace_add_test(
|
||||
SKIP_RUNTIME SKIP_SAMPLING
|
||||
NAME "mpi-flat"
|
||||
TARGET mpi-example
|
||||
MPI ON
|
||||
NUM_PROCS 4
|
||||
LABELS "mpip"
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
2
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
--min-instructions
|
||||
0
|
||||
ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
|
||||
)
|
||||
omnitrace_add_test(
|
||||
SKIP_RUNTIME SKIP_SAMPLING
|
||||
NAME "mpi-flat"
|
||||
TARGET mpi-example
|
||||
MPI ON
|
||||
NUM_PROCS 4
|
||||
LABELS "mpip"
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
2
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
--min-instructions
|
||||
0
|
||||
ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join"
|
||||
)
|
||||
endif()
|
||||
|
||||
omnitrace_add_test(
|
||||
NAME lulesh
|
||||
TARGET lulesh
|
||||
MPI ${LULESH_USE_MPI}
|
||||
GPU ${LULESH_USE_GPU}
|
||||
NUM_PROCS 8
|
||||
LABELS "kokkos"
|
||||
REWRITE_ARGS -e -v 2 --label file line return args
|
||||
@@ -674,6 +796,7 @@ omnitrace_add_test(
|
||||
NAME lulesh-baseline-kokkosp-libomnitrace
|
||||
TARGET lulesh
|
||||
MPI ${LULESH_USE_MPI}
|
||||
GPU ${LULESH_USE_GPU}
|
||||
NUM_PROCS 8
|
||||
LABELS "kokkos;kokkos-profile-library"
|
||||
RUN_ARGS -i 10 -s 20 -p
|
||||
@@ -686,6 +809,7 @@ omnitrace_add_test(
|
||||
NAME lulesh-baseline-kokkosp-libomnitrace-dl
|
||||
TARGET lulesh
|
||||
MPI ${LULESH_USE_MPI}
|
||||
GPU ${LULESH_USE_GPU}
|
||||
NUM_PROCS 8
|
||||
LABELS "kokkos;kokkos-profile-library"
|
||||
RUN_ARGS -i 10 -s 20 -p
|
||||
@@ -698,6 +822,7 @@ omnitrace_add_test(
|
||||
NAME lulesh-kokkosp
|
||||
TARGET lulesh
|
||||
MPI ${LULESH_USE_MPI}
|
||||
GPU ${LULESH_USE_GPU}
|
||||
NUM_PROCS 8
|
||||
LABELS "kokkos"
|
||||
REWRITE_ARGS -e -v 2
|
||||
@@ -721,6 +846,7 @@ omnitrace_add_test(
|
||||
NAME lulesh-perfetto
|
||||
TARGET lulesh
|
||||
MPI ${LULESH_USE_MPI}
|
||||
GPU ${LULESH_USE_GPU}
|
||||
NUM_PROCS 8
|
||||
LABELS "kokkos;loops"
|
||||
REWRITE_ARGS -e -v 2
|
||||
@@ -743,6 +869,7 @@ omnitrace_add_test(
|
||||
NAME lulesh-timemory
|
||||
TARGET lulesh
|
||||
MPI ${LULESH_USE_MPI}
|
||||
GPU ${LULESH_USE_GPU}
|
||||
NUM_PROCS 8
|
||||
LABELS "kokkos;loops"
|
||||
REWRITE_ARGS -e -v 2 -l --dynamic-callsites --traps --allow-overlapping
|
||||
@@ -965,6 +1092,64 @@ if(TARGET parallel-overhead AND _VALID_PTRACE_SCOPE)
|
||||
"Dyninst was unable to attach to the specified process")
|
||||
endif()
|
||||
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
#
|
||||
# rccl tests
|
||||
#
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
foreach(_TARGET ${RCCL_TEST_TARGETS})
|
||||
string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}")
|
||||
string(REPLACE "_" "-" _NAME "${_NAME}")
|
||||
omnitrace_add_test(
|
||||
SKIP_SAMPLING
|
||||
NAME rccl-test-${_NAME}
|
||||
TARGET ${_TARGET}
|
||||
LABELS "rccl-tests;rcclp"
|
||||
MPI ON
|
||||
GPU ON
|
||||
NUM_PROCS 1
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
2
|
||||
-i
|
||||
8
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
RUNTIME_ARGS
|
||||
-e
|
||||
-v
|
||||
1
|
||||
-i
|
||||
8
|
||||
--label
|
||||
file
|
||||
line
|
||||
return
|
||||
args
|
||||
RUN_ARGS -t
|
||||
1
|
||||
-g
|
||||
1
|
||||
-i
|
||||
10
|
||||
-w
|
||||
2
|
||||
-m
|
||||
2
|
||||
-p
|
||||
-c
|
||||
1
|
||||
-z
|
||||
-s
|
||||
1
|
||||
ENVIRONMENT "${_rccl_environment}")
|
||||
endforeach()
|
||||
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
#
|
||||
# python tests
|
||||
|
||||
在新工单中引用
屏蔽一个用户