Files
rocm-systems/.github/workflows/continuous_integration.yml
T
Jonathan R. Madsen 2f9b1767e9 Handle hsa_queue_destroy after finalization (#679)
* Handle hsa_queue_destroy after finalization

- fixes issue where hsa_queue_destroy(...) is invoked after rocprofiler-sdk has finalized
- hsa::get_queue_controller() returns pointer
- if queue controller is a null pointer, skip invoking QueueController::destroy_queue

* Update HIP/HSA/marker update_table logging

* Update rocprofv3 tests

- remove HSA_TOOLS_LIB env variable
- remove setting ROCPROFILER_LOG_LEVEL env variable
- add timeouts to tests which are missing them

* Disable thread sanitizer deadlock detection

* Update CI workflow

- rename vega20-ubuntu job to core-ci
- enable navi32 in core-ci and sanitizers

* Update run-ci.py

- set gcovr html medium and high threshold

* Update lib/rocprofiler-sdk/hsa/queue_controller.cpp

- remove this capture from enable/disable serialization

* Update lib/rocprofiler-sdk/hsa/{hsa_barrier,profile_serializer}.*

- hsa_barrier::set_barrier accepts const-ref to queue map
- profile_serializer::enable and profile_serializer::disable accept const-ref to queue map

* Logging for HIP/HSA/marker/profile_serializer

* Logging for HIP/HSA/marker/queue_controller

* Improve test_retired_correlation_ids asserts

* Fix tests/counter-collection/validate.py

- scale expected SQ_WAVES counter value based on warp size of GPU

* Tweak github comment for code coverage

* Remove gcovr html high/medium threshold args

* Fix tests/counter-collection/validate.py

- round before casting to int in test_counter_values

* operator bool for profile_serializer

- only wait on CV if profile_serializer is used

* Logging updates (profile_serializer + code_object)

* Update counter-collection validate.py

* QueueController does not wait on CV if finalizing/finalized

* Update CI workflow

- remove navi32 from core job

* Improve HIP/HSA/marker tracing get_functor/functor

- remove lambda wrapper around functor

* Update lib/rocprofiler-sdk/hsa/queue_controller.cpp

- do not acquire cvmutex lock during finalization

* Update lib/rocprofiler-sdk/hsa/hsa_barrier.*

- move ctor and dtor to implementation
- skip signal store screlease and destroy if already finalized

* Update CI workflow

- remove navi32 runners

* bwelton fixes for hangs

* CMake improvements + simplified demangle

- remove amd-comgr from common target (and thus removed from roctx DT_NEEDED)

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
2024-03-21 17:52:15 -05:00

446 lines
16 KiB
YAML

# Continuous integration workflow.
#
# Triggers: manual dispatch, pushes to main, and pull requests targeting main
# (changes limited to '*.md' or 'source/docs/**' are ignored).
#
# Jobs (each runs on the `vega20` runner inside the staging-base container):
#   - core:          configure/build/test via run-ci.py with `--linter clang-tidy`,
#                    install, package, then rebuild and run samples/tests against
#                    the install tree and against the installed .deb packages
#   - code-coverage: three run-ci.py coverage passes (all / tests / samples),
#                    a pull-request comment with the report, and a check that
#                    ctest labels are complete and non-overlapping
#   - sanitizers:    one run-ci.py `--memcheck` run per sanitizer in the matrix
name: Continuous Integration

on:
  workflow_dispatch:
  push:
    branches: [ "main" ]
    paths-ignore:
      - '*.md'
      - 'source/docs/**'
  pull_request:
    branches: [ "main" ]
    paths-ignore:
      - '*.md'
      - 'source/docs/**'

# A newer run for the same workflow + ref cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  # TODO(jrmadsen): replace LD_RUNPATH_FLAG, GPU_LIST, etc. with internal handling in cmake
  ROCM_PATH: "/opt/rocm"
  GPU_LIST: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102"
  # NOTE(review): workflow-level `env` values are not shell-expanded, so `$PATH`
  # is presumably passed through literally here. Later steps extend PATH by hand
  # (see "Configure Env" and "Install Packages"), so confirm before changing this.
  PATH: "/usr/bin:$PATH"

jobs:
  core:
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    strategy:
      fail-fast: true
      matrix:
        os: ['ubuntu-22.04']
        runner: ['vega20']
        build-type: ['RelWithDebInfo']
        ci-flags: ['--linter clang-tidy']
        name-tag: ['']

    runs-on: ${{ matrix.runner }}

    # define this for containers
    env:
      GIT_DISCOVERY_ACROSS_FILESYSTEM: 1

    container:
      # TODO(aelwazir): Adding multiple os distros
      image: evuedsoacr.azurecr.io/ml-lang-comp-us/rocm-tools/rocm-rocprofiler-sdk-staging-base:latest
      options: --ipc=host --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined
      credentials:
        username: ${{ secrets.DOCKER_USERNAME }}
        password: ${{ secrets.DOCKER_PASSWORD }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true

      - name: Install requirements
        shell: bash
        run: |
          git config --global --add safe.directory '*'
          apt-get update
          apt-get install -y cmake clang-tidy-11 g++-11 g++-12 python3-pip
          update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 10
          update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10 --slave /usr/bin/g++ g++ /usr/bin/g++-11
          update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 20 --slave /usr/bin/g++ g++ /usr/bin/g++-12
          python3 -m pip install -r requirements.txt
          python3 -m pip install pytest
          python3 -m pip install 'cmake>=3.22.0'

      - name: List Files
        shell: bash
        run: |
          which-realpath() { echo -e "\n$1 resolves to $(realpath $(which $1))"; echo "$($(which $1) --version &> /dev/stdout | head -n 1)"; }
          for i in python python3 git cmake ctest; do which-realpath $i; done
          ls -la

      # Folded scalar: the lines below are joined with single spaces into one
      # command. Everything after the bare `--` is a CMake cache definition.
      - name: Configure, Build, and Test
        timeout-minutes: 30
        shell: bash
        run: >-
          python3 ./source/scripts/run-ci.py -B build
          --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.os }}${{ matrix.name-tag }}
          --build-jobs 8
          --site ${{ matrix.runner }}
          --gpu-targets ${{ env.GPU_LIST }}
          ${{ matrix.ci-flags }}
          --
          -DROCPROFILER_DEP_ROCMCORE=ON
          -DROCPROFILER_BUILD_DOCS=ON
          -DCMAKE_BUILD_TYPE=${{ matrix.build-type }}
          -DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-sdk
          -DCPACK_GENERATOR='DEB;RPM;TGZ'
          -DCPACK_PACKAGING_INSTALL_PREFIX="$(realpath /opt/rocm)"
          -DPython3_EXECUTABLE=$(which python3)

      - name: Install
        timeout-minutes: 10
        run: cmake --build build --target install --parallel 8

      - name: Build Packaging
        timeout-minutes: 10
        run: cmake --build build --target package --parallel 8

      # Rebuild samples and tests as standalone projects against the install
      # tree in /opt/rocprofiler-sdk, then run them.
      - name: Test Install Build
        timeout-minutes: 10
        shell: bash
        run: |
          CMAKE_PREFIX_PATH=/opt/rocprofiler-sdk cmake -B build-samples samples
          CMAKE_PREFIX_PATH=/opt/rocprofiler-sdk cmake -B build-tests tests
          export LD_LIBRARY_PATH=/opt/rocprofiler-sdk/lib:${LD_LIBRARY_PATH}
          cmake --build build-samples --target all --parallel 8
          cmake --build build-tests --target all --parallel 8
          ctest --test-dir build-samples --output-on-failure
          ctest --test-dir build-tests --output-on-failure

      - name: Install Packages
        timeout-minutes: 5
        shell: bash
        run: |
          export PATH=${PATH}:/usr/local/sbin:/usr/sbin:/sbin
          ls -la
          ls -la ./build
          for i in $(ls -S ./build/rocprofiler-sdk*.deb); do dpkg -i ${i}; done;

      # Same as "Test Install Build", but using the samples/tests shipped by
      # the .deb packages under /opt/rocm.
      - name: Test Installed Packages
        timeout-minutes: 20
        shell: bash
        run: |
          CMAKE_PREFIX_PATH=/opt/rocm cmake -B build-samples-deb /opt/rocm/share/rocprofiler-sdk/samples
          CMAKE_PREFIX_PATH=/opt/rocm cmake -B build-tests-deb /opt/rocm/share/rocprofiler-sdk/tests
          cmake --build build-samples-deb --target all --parallel 8
          cmake --build build-tests-deb --target all --parallel 8
          ctest --test-dir build-samples-deb --output-on-failure
          ctest --test-dir build-tests-deb --output-on-failure

      - name: Archive production artifacts
        uses: actions/upload-artifact@v4
        with:
          name: installers
          path: |
            ${{github.workspace}}/build/*.deb
            ${{github.workspace}}/build/*.rpm
            ${{github.workspace}}/build/*.tgz

  code-coverage:
    strategy:
      fail-fast: true
      matrix:
        # TODO: Change it back when re-enabling on mi200
        include:
          - os: 'ubuntu-22.04'
            runner: 'vega20'
            build-type: 'Release'
        # include:
        #   - os: 'ubuntu-22.04'
        #     runner: 'renderD131'
        #     device: '/renderD131'
        #     build-type: 'Release'

    runs-on: ${{ matrix.runner }}

    # define this for containers
    env:
      GIT_DISCOVERY_ACROSS_FILESYSTEM: 1

    container:
      # TODO(aelwazir): Adding multiple os distros
      image: evuedsoacr.azurecr.io/ml-lang-comp-us/rocm-tools/rocm-rocprofiler-sdk-staging-base:latest
      options: --ipc=host --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined
      credentials:
        username: ${{ secrets.DOCKER_USERNAME }}
        password: ${{ secrets.DOCKER_PASSWORD }}

    # TODO: Uncomment this when re-enabling tests on the mi200 as it contains --memory and --cpus flag for the mi200. Remove these 2 options when running on vega20.
    # vega20 machine only has 24 cpus available.
    # container:
    #   # TODO(aelwazir): Adding multiple os distros
    #   image: evuedsoacr.azurecr.io/ml-lang-comp-us/rocm-tools/rocm-rocprofiler-sdk-staging-base:latest
    #   options: --memory=128g --cpus=32 --ipc=host --device=/dev/kfd --device=/dev/dri${{ matrix.device }} --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined
    #   credentials:
    #     username: ${{ secrets.DOCKER_USERNAME }}
    #     password: ${{ secrets.DOCKER_PASSWORD }}

    steps:
      # Installs git from the git-core PPA before the checkout step runs.
      - name: Patch Git
        timeout-minutes: 25
        run: |
          apt-get update
          apt-get install -y software-properties-common
          add-apt-repository -y ppa:git-core/ppa
          apt-get update
          apt-get install -y git

      - uses: actions/checkout@v4
        with:
          submodules: true

      # On pull requests, look up the coverage data cached under the base
      # commit's key (written by "Save XML Code Coverage" below).
      - name: Load Existing XML Code Coverage
        if: github.event_name == 'pull_request'
        id: load-coverage
        uses: actions/cache@v4
        with:
          key: ${{ github.event.pull_request.base.sha }}-codecov
          path: .codecov/**

      - name: Copy Existing XML Code Coverage
        if: github.event_name == 'pull_request'
        shell: bash
        run: |
          if [ -d .codecov ]; then cp -r .codecov .codecov.ref; fi

      - name: Configure Env
        shell: bash
        run: |
          echo "${PATH}:/usr/local/bin:${HOME}/.local/bin" >> $GITHUB_PATH
          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib:${HOME}/.local/lib" >> $GITHUB_ENV

      - name: List Files
        shell: bash
        run: |
          echo "PATH: ${PATH}"
          echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}"
          which-realpath() { echo -e "\n$1 resolves to $(realpath $(which $1))"; echo "$($(which $1) --version &> /dev/stdout | head -n 1)"; }
          for i in python python3 git cmake ctest; do which-realpath $i; done
          ls -la

      - name: Install requirements
        shell: bash
        run: |
          git config --global --add safe.directory '*'
          apt-get update
          apt-get install -y cmake python3-pip gcovr wkhtmltopdf xvfb xfonts-base xfonts-75dpi xfonts-100dpi xfonts-utils xfonts-encodings libfontconfig
          python3 -m pip install -r requirements.txt
          python3 -m pip install pytest pycobertura

      - name: Configure, Build, and Test (Total Code Coverage)
        timeout-minutes: 30
        shell: bash
        run: >-
          python3 ./source/scripts/run-ci.py -B build
          --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.os }}-codecov
          --build-jobs 8
          --site ${{ matrix.runner }}
          --gpu-targets ${{ env.GPU_LIST }}
          --coverage all
          --
          -DCMAKE_BUILD_TYPE=${{ matrix.build-type }}
          -DPython3_EXECUTABLE=$(which python3)

      # The .gcda files left by the previous pass are removed first.
      # `find -delete` succeeds even when nothing matches, unlike `xargs rm`
      # which fails on empty input.
      - name: Configure, Build, and Test (Tests Code Coverage)
        timeout-minutes: 30
        shell: bash
        run: >-
          find build -type f -name '*.gcda' -delete &&
          python3 ./source/scripts/run-ci.py -B build
          --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.os }}-codecov-tests
          --build-jobs 8
          --site ${{ matrix.runner }}
          --gpu-targets ${{ env.GPU_LIST }}
          --coverage tests
          --
          -DCMAKE_BUILD_TYPE=${{ matrix.build-type }}
          -DPython3_EXECUTABLE=$(which python3)

      - name: Configure, Build, and Test (Samples Code Coverage)
        timeout-minutes: 30
        shell: bash
        run: >-
          find build -type f -name '*.gcda' -delete &&
          python3 ./source/scripts/run-ci.py -B build
          --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.os }}-codecov-samples
          --build-jobs 8
          --site ${{ matrix.runner }}
          --gpu-targets ${{ env.GPU_LIST }}
          --coverage samples
          --
          -DCMAKE_BUILD_TYPE=${{ matrix.build-type }}
          -DPython3_EXECUTABLE=$(which python3)

      - name: Save XML Code Coverage
        id: save-coverage
        uses: actions/cache/save@v4
        with:
          key: ${{ github.sha }}-codecov
          path: |
            .codecov/*.xml

      # Renders each HTML report to PNG, uploads the images, and assembles
      # .codecov/report.md from the generated markdown fragments.
      - name: Generate Code Coverage Comment
        if: github.event_name == 'pull_request'
        timeout-minutes: 5
        shell: bash
        run: |
          echo "PWD: ${PWD}"
          ls -la
          for i in "all" "tests" "samples"; do
              wkhtmltoimage --enable-local-file-access --quality 85 .codecov/${i}.html .codecov/${i}.png
          done
          ls -la .codecov
          which -a git
          git --version
          ./source/scripts/upload-image-to-github.py --bot --token ${{ github.token }} --files .codecov/{all,tests,samples}.png --output-dir .codecov --name pr-${{ github.event.pull_request.number }}
          echo -e "\n${PWD}:"
          ls -la .
          echo -e "\n.codecov:"
          ls -la .codecov
          get-base-md-contents() { cat .codecov/${1}.png.md; }
          get-full-md-contents() { cat .codecov/${1}.png.md .codecov/${1}.md; }
          cat << EOF > .codecov/report.md
          # Code Coverage Report
          ## Tests Only
          $(get-base-md-contents tests)
          ## Samples Only
          $(get-base-md-contents samples)
          ## Tests + Samples
          $(get-full-md-contents all)
          EOF

      - name: Write Code Coverage Comment
        if: github.event_name == 'pull_request'
        timeout-minutes: 5
        uses: thollander/actions-comment-pull-request@v2.5.0
        with:
          comment_tag: codecov-report
          filePath: .codecov/report.md

      - name: Archive Code Coverage Data
        uses: actions/upload-artifact@v4
        with:
          name: code-coverage-details
          path: |
            ${{github.workspace}}/.codecov/*

      - name: Verify Test Labels
        timeout-minutes: 5
        shell: bash
        run: |
          pushd build
          #
          # if following fails, there is a test that does not have
          # a label identifying it as sample or test (unit or integration).
          # Recommended labels are:
          #   - samples
          #   - unittests
          #   - integration-tests
          #
          ctest -N -LE 'samples|tests' -O ctest.mislabeled.log
          grep 'Total Tests: 0' ctest.mislabeled.log
          #
          # if following fails, then there is overlap between the labels.
          # A test cannot both be a sample and (unit/integration) test.
          #
          ctest -N -O ctest.all.log
          ctest -N -O ctest.samples.log -L samples
          ctest -N -O ctest.tests.log -L tests
          NUM_ALL=$(grep 'Total Tests:' ctest.all.log | awk '{print $NF}')
          NUM_SAMPLE=$(grep 'Total Tests:' ctest.samples.log | awk '{print $NF}')
          NUM_TEST=$(grep 'Total Tests:' ctest.tests.log | awk '{print $NF}')
          NUM_SUM=$((${NUM_SAMPLE} + ${NUM_TEST}))
          echo "Total tests: ${NUM_ALL}"
          echo "Total labeled tests: ${NUM_SUM}"
          # operands are quoted so an empty count is reported as a mismatch
          # instead of turning the test into a malformed `[` expression
          if [ "${NUM_ALL}" != "${NUM_SUM}" ]; then
              echo "Test label overlap"
              exit 1
          fi
          popd

  sanitizers:
    strategy:
      # keep the remaining sanitizer runs going when one of them fails
      fail-fast: false
      matrix:
        os: ['ubuntu-22.04']
        runner: ['vega20']
        build-type: ['RelWithDebInfo']
        sanitizer: ['AddressSanitizer', 'ThreadSanitizer', 'LeakSanitizer']
        ci-flags: ['']

    runs-on: ${{ matrix.runner }}

    # define this for containers
    env:
      GIT_DISCOVERY_ACROSS_FILESYSTEM: 1

    container:
      # TODO(aelwazir): Adding multiple os distros
      image: evuedsoacr.azurecr.io/ml-lang-comp-us/rocm-tools/rocm-rocprofiler-sdk-staging-base:latest
      options: --privileged --ipc=host --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined
      credentials:
        username: ${{ secrets.DOCKER_USERNAME }}
        password: ${{ secrets.DOCKER_PASSWORD }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true

      - name: List Files
        shell: bash
        run: |
          which-realpath() { echo -e "\n$1 resolves to $(realpath $(which $1))"; echo "$($(which $1) --version &> /dev/stdout | head -n 1)"; }
          for i in python python3 git cmake ctest; do which-realpath $i; done
          ls -la

      # `add-apt-repository -y` skips the confirmation prompt, matching the
      # "Patch Git" step of the code-coverage job.
      - name: Install requirements
        shell: bash
        run: |
          git config --global --add safe.directory '*'
          apt-get update
          apt-get install -y cmake python3-pip libasan8 libtsan2 software-properties-common
          python3 -m pip install -r requirements.txt
          python3 -m pip install pytest
          add-apt-repository -y ppa:ubuntu-toolchain-r/test
          apt-get update
          apt-get install -y g++-13
          update-alternatives --install $(which gcc) gcc $(which gcc-13) 100 --slave $(which g++) g++ $(which g++-13)
          realpath $(which gcc)
          realpath $(which g++)
          gcc --version
          g++ --version

      - name: Configure, Build, and Test
        timeout-minutes: 45
        shell: bash
        run: >-
          python3 ./source/scripts/run-ci.py -B build
          --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.os }}-${{ matrix.sanitizer }}
          --build-jobs 8
          --site ${{ matrix.runner }}
          --gpu-targets ${{ env.GPU_LIST }}
          --memcheck=${{ matrix.sanitizer }}
          ${{ matrix.ci-flags }}
          --
          -DCMAKE_BUILD_TYPE=${{ matrix.build-type }}
          -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}"
          -DPython3_EXECUTABLE=$(which python3)