Added AMDSMI CI to rocm-systems (#2074)

Signed-off-by: Justin Williams <Justin.Williams@amd.com>
This commit is contained in:
amd-juwillia
2025-12-16 12:52:42 -07:00
کامیت شده توسط GitHub
والد c9ac018395
کامیت 3a3738ad98
4 فایلهای تغییر یافته به همراه 1200 افزوده شده و 3 حذف شده
@@ -0,0 +1,878 @@
name: AMDSMI CI
on:
pull_request:
branches: [develop]
paths:
- 'projects/amdsmi/**'
- '.github/workflows/amdsmi-build.yml'
push:
branches: [develop]
paths:
- 'projects/amdsmi/**'
- '.github/workflows/amdsmi-build.yml'
workflow_dispatch:
permissions:
contents: read
env:
DEBIAN_FRONTEND: noninteractive
DEBCONF_NONINTERACTIVE_SEEN: true
BUILD_TYPE: Release
ROCM_DIR: /opt/rocm
jobs:
debian-buildinstall:
name: Build
runs-on:
- self-hosted
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
max-parallel: 10
matrix:
os: [Ubuntu20, Ubuntu22, Debian10]
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
steps:
- uses: actions/checkout@v4
- name: Set Project Directory
  run: |
    # Locate the AMDSMI source root (the directory holding its top-level
    # CMakeLists.txt) and export it as PROJECT_DIR for the later steps.
    # `head -n 1` keeps the value to a single line: GITHUB_ENV entries are
    # line-oriented, so several matches would corrupt the variable.
    TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
    if [ -z "$TARGET_DIR" ]; then
      echo "Could not find CMakeLists.txt in projects/amdsmi. Searching root..."
      TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
    fi
    # Stop here instead of exporting an empty PROJECT_DIR: later steps build
    # their paths from it and would otherwise operate on "/build".
    if [ -z "$TARGET_DIR" ]; then
      echo "No CMakeLists.txt found under $GITHUB_WORKSPACE. Exiting."
      exit 1
    fi
    echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"
- name: Update repositories for Debian10
if: matrix.os == 'Debian10'
run: |
set -e
echo 'Updating repositories for Debian10 (archived)'
cat > /etc/apt/sources.list << EOF
deb http://archive.debian.org/debian buster main
deb http://archive.debian.org/debian-security buster/updates main
EOF
echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until
apt update
- name: Build AMDSMI
  shell: bash
  run: |
    # Configure, build and package AMDSMI, retrying the whole sequence up to
    # RETRIES times with a linearly growing pause between attempts.
    #
    # pipefail is essential here: every build command is piped through
    # `tee`, and without it the pipeline's status is tee's (always 0), so a
    # failed cmake/make would be reported as success and never retried.
    set -eo pipefail
    echo 'Building on ${{ matrix.os }}'
    BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
    RETRIES=3
    for i in $(seq 1 $RETRIES); do
      echo "Build attempt $i for ${{ matrix.os }}..."
      # Start every attempt from an empty build tree.
      rm -rf "$BUILD_FOLDER"
      mkdir -p "$BUILD_FOLDER"
      cd "$BUILD_FOLDER"
      # Configure, build, and package
      if cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \
         make -j "$(nproc)" 2>&1 | tee make.log && \
         make package 2>&1 | tee package.log; then
        # Parse and report warnings as GitHub annotations
        echo "::group::Build Warnings"
        # `|| true`: grep exits 1 when it finds no warning, which must not
        # abort the step now that pipefail is active.
        { grep -i "warning" cmake.log make.log package.log || true; } | while read -r line; do
          echo "::warning::$line"
        done
        echo "::endgroup::"
        echo "Build successful on attempt $i"
        break
      else
        echo "Build failed on attempt $i"
        if [ "$i" -eq "$RETRIES" ]; then
          echo "All $RETRIES build attempts failed. Exiting."
          exit 1
        fi
        # Back off a little longer after each failed attempt.
        sleep $((2 * i))
      fi
    done
    echo "Build completed on ${{ matrix.os }}"
- name: Install AMDSMI
  run: |
    # Install the freshly built .deb (up to RETRIES attempts), expose the
    # amd-smi CLI on PATH and sanity-check the installation.
    cd "${{ env.PROJECT_DIR }}/build"
    # Debian10 already ran `apt update` in its dedicated repository step.
    if [ "${{ matrix.os }}" != "Debian10" ]; then
      apt update
    fi
    RETRIES=3
    for i in $(seq 1 $RETRIES); do
      echo "Installation attempt $i for ${{ matrix.os }}..."
      if apt install -y ./amd-smi-lib*99999-local_amd64.deb; then
        echo "Installation successful on attempt $i"
        # -f keeps the step idempotent if the link is already present.
        ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
        # Verify Installation
        echo 'Verifying installation:'
        amd-smi version
        python3 -m pip list | grep amd
        python3 -m pip list | grep pip
        python3 -m pip list | grep setuptools
        echo 'Completed installation on ${{ matrix.os }}'
        break
      else
        echo "Installation failed on attempt $i"
        if [ "$i" -eq "$RETRIES" ]; then
          echo "All $RETRIES installation attempts failed. Exiting."
          exit 1
        fi
        # Back off a little longer after each failed attempt.
        sleep $((2 * i))
      fi
    done
    echo "Installation completed on ${{ matrix.os }}"
- name: Uninstall
if: always()
run: |
set -e
echo 'Uninstalling on ${{ matrix.os }}'
apt remove -y amd-smi-lib || true
rm -f /usr/local/bin/amd-smi
if [ -d /opt/rocm/share/amd_smi ]; then
echo '/opt/rocm/share/amd_smi exists. Removing.'
rm -rf /opt/rocm/share/amd_smi
fi
echo 'Uninstall done on ${{ matrix.os }}'
debian-test:
name: Tests
needs: debian-buildinstall
runs-on:
- self-hosted
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
max-parallel: 10
matrix:
os: [Ubuntu20, Ubuntu22, Debian10]
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
steps:
- uses: actions/checkout@v4
- name: Set Project Directory
  run: |
    # Locate the AMDSMI source root (the directory holding its top-level
    # CMakeLists.txt) and export it as PROJECT_DIR for the later steps.
    # `head -n 1` keeps the value to a single line: GITHUB_ENV entries are
    # line-oriented, so several matches would corrupt the variable.
    TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
    if [ -z "$TARGET_DIR" ]; then
      TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
    fi
    # Stop here instead of exporting an empty PROJECT_DIR: later steps build
    # their paths from it and would otherwise operate on "/build".
    if [ -z "$TARGET_DIR" ]; then
      echo "No CMakeLists.txt found under $GITHUB_WORKSPACE. Exiting."
      exit 1
    fi
    echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"
- name: Update repositories for Debian10
if: matrix.os == 'Debian10'
run: |
set -e
echo 'Updating repositories for Debian10 (archived)'
cat > /etc/apt/sources.list << EOF
deb http://archive.debian.org/debian buster main
deb http://archive.debian.org/debian-security buster/updates main
EOF
echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until
apt update
- name: Build and Install for Test
run: |
set -e
echo 'Building for test on ${{ matrix.os }}'
BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
RETRIES=3
for i in $(seq 1 $RETRIES); do
echo "Build attempt $i for ${{ matrix.os }} test..."
rm -rf $BUILD_FOLDER
mkdir -p $BUILD_FOLDER
cd $BUILD_FOLDER
if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
make -j $(nproc) && \
make package; then
echo "Build successful on attempt $i"
break
else
echo "Build failed on attempt $i"
if [ $i -eq $RETRIES ]; then
echo "All $RETRIES build attempts failed. Exiting."
exit 1
fi
sleep $((2 * i))
fi
done
echo 'Installing for test on ${{ matrix.os }}'
for i in $(seq 1 $RETRIES); do
echo "Installation attempt $i for test on ${{ matrix.os }}..."
if apt install -y $BUILD_FOLDER/amd-smi-lib*99999-local_amd64.deb; then
echo "Installation successful on attempt $i"
ln -s /opt/rocm/bin/amd-smi /usr/local/bin
echo 'Install done for test on ${{ matrix.os }}'
break
else
echo "Installation failed on attempt $i"
if [ $i -eq $RETRIES ]; then
echo "All $RETRIES installation attempts failed. Exiting."
exit 1
fi
sleep $((2 * i))
fi
done
- name: AMDSMI Command Tests
shell: bash
run: |
set -e
echo "Running AMDSMI commands on ${{ matrix.os }}"
mkdir -p /tmp/test-results-${{ matrix.os }}
commands=(
"amd-smi version"
"amd-smi list"
"amd-smi static"
"amd-smi firmware"
"amd-smi ucode"
"amd-smi bad-pages"
"amd-smi metric"
"amd-smi process"
"amd-smi topology"
"amd-smi monitor"
"amd-smi dmon"
"amd-smi xgmi"
"amd-smi partition"
)
for cmd in "${commands[@]}"; do
debug_cmd="$cmd --loglevel debug"
echo "Running: $debug_cmd"
if ! eval "$debug_cmd" > /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log 2>&1; then
echo "Command '$debug_cmd' failed."
cat /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log
exit 1
else
echo "$debug_cmd passed."
fi
done
echo "AMDSMI commands done on ${{ matrix.os }}"
- name: Upload AMDSMI Command Test Results
if: always()
uses: actions/upload-artifact@v4
with:
name: amdsmi-command-tests-${{ matrix.os }}
path: /tmp/test-results-${{ matrix.os }}
- name: Run AMDSMI, Python, and Example Tests
shell: bash
run: |
set -e
echo 'Running other tests on ${{ matrix.os }}'
# AMDSMI Tests
echo 'Running AMDSMI tests'
cd /opt/rocm/share/amd_smi/tests
source amdsmitst.exclude
AMDSMI_RETRIES=3
for attempt in $(seq 1 $AMDSMI_RETRIES); do
echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..."
if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log 2>&1; then
echo "AMDSMI tests passed on attempt $attempt"
echo "=============== TEST OUTPUT ==============="
cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]"
echo "=============================================="
echo "AMDSMI tests done"
break
else
TEST_EXIT_CODE=$?
echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE"
if [ $attempt -eq $AMDSMI_RETRIES ]; then
echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure."
echo "=============== TEST OUTPUT ==============="
cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]"
echo "=============================================="
echo "AMDSMI tests failed"
exit $TEST_EXIT_CODE
else
echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..."
sleep $((2 * attempt))
fi
fi
done
# Python Tests
echo 'Running Python tests'
cd /opt/rocm/share/amd_smi/tests/python_unittest
echo "Running integration tests..."
if ! ./integration_test.py -v > /tmp/test-results-${{ matrix.os }}/integration_test_output.txt 2>&1; then
echo "Integration tests failed!"
echo "=============== INTEGRATION TEST OUTPUT ==============="
tail -100 /tmp/test-results-${{ matrix.os }}/integration_test_output.txt
echo "======================================================="
exit 1
else
echo "Integration tests passed"
fi
echo "Running unit tests..."
if ! ./unit_tests.py -v > /tmp/test-results-${{ matrix.os }}/unit_test_output.txt 2>&1; then
echo "Unit tests failed!"
echo "=============== UNIT TEST OUTPUT ==============="
tail -100 /tmp/test-results-${{ matrix.os }}/unit_test_output.txt
echo "================================================"
exit 1
else
echo "Unit tests passed"
fi
echo "Python tests done"
# Example Tests
echo 'Running Example tests'
cd ${{ env.PROJECT_DIR }}/example
rm -rf build
cmake -B build -DENABLE_ESMI_LIB=OFF
make -C build -j $(nproc)
cd build
./amd_smi_drm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log 2>&1 || echo 'amd_smi_drm_ex failed'
./amd_smi_nodrm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log 2>&1 || echo 'amd_smi_nodrm_ex failed'
echo "Example tests done"
- name: AMDSMI Test Results
if: always()
run: |
echo "Displaying AMDSMI test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}"
- name: Integration Test Results
if: always()
run: |
echo "Displaying Integration test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}"
- name: Unit Test Results
if: always()
run: |
echo "Displaying Unit Test Results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}"
- name: Example DRM Test Results
if: always()
run: |
echo "Displaying Example DRM test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}"
- name: Example NoDRM Test Results
if: always()
run: |
echo "Displaying Example NoDRM test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}"
rpm-buildinstall:
name: Build
runs-on:
- self-hosted
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
max-parallel: 10
matrix:
os:
- SLES
- RHEL8
- RHEL9
- RHEL10
- AzureLinux3
- AlmaLinux8
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
steps:
- uses: actions/checkout@v4
- name: Set Project Directory
  run: |
    # Locate the AMDSMI source root (the directory holding its top-level
    # CMakeLists.txt) and export it as PROJECT_DIR for the later steps.
    # `head -n 1` keeps the value to a single line: GITHUB_ENV entries are
    # line-oriented, so several matches would corrupt the variable.
    TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
    if [ -z "$TARGET_DIR" ]; then
      TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
    fi
    # Stop here instead of exporting an empty PROJECT_DIR: later steps build
    # their paths from it and would otherwise operate on "/build".
    if [ -z "$TARGET_DIR" ]; then
      echo "No CMakeLists.txt found under $GITHUB_WORKSPACE. Exiting."
      exit 1
    fi
    echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"
- name: Set PkgMgr
run: |
set -e
case "${{ matrix.os }}" in
SLES)
echo "PACKAGE_MANAGER=zypper" >> $GITHUB_ENV
;;
RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
echo "PACKAGE_MANAGER=dnf" >> $GITHUB_ENV
;;
esac
- name: Add more_itertools
if: matrix.os == 'AzureLinux3'
run: |
set -e
echo 'Installing more_itertools on ${{ matrix.os }}'
python3 -m pip install more_itertools
- name: Build AMDSMI(RHEL10 & AlmaLinux8)
if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
run: |
set -e
echo 'Building on ${{ matrix.os }} with retries and QA_RPATHS'
BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
RETRIES=5
# Set QA_RPATHS to ignore empty (0x0010) and invalid (0x0002) RPATHs
export QA_RPATHS=$((0x0010 | 0x0002))
for i in $(seq 1 $RETRIES); do
echo "Build attempt $i for ${{ matrix.os }} ..."
rm -rf $BUILD_FOLDER
mkdir -p $BUILD_FOLDER
cd $BUILD_FOLDER
if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
make -j $(nproc) && \
make package; then
echo "Build successful on attempt $i"
break
else
echo "Build failed on attempt $i"
if [ $i -eq $RETRIES ]; then
echo "All $RETRIES build attempts failed. Exiting."
exit 1
fi
sleep $((2 * i))
fi
done
echo "Build completed on ${{ matrix.os }}"
- name: Build AMDSMI
  if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
  shell: bash
  run: |
    # Configure, build and package AMDSMI, retrying the whole sequence up to
    # RETRIES times with a linearly growing pause between attempts.
    #
    # pipefail is essential here: every build command is piped through
    # `tee`, and without it the pipeline's status is tee's (always 0), so a
    # failed cmake/make would be reported as success and never retried.
    set -eo pipefail
    echo 'Building on ${{ matrix.os }}'
    BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
    RETRIES=3
    for i in $(seq 1 $RETRIES); do
      echo "Build attempt $i for ${{ matrix.os }}..."
      # Start every attempt from an empty build tree.
      rm -rf "$BUILD_FOLDER"
      mkdir -p "$BUILD_FOLDER"
      cd "$BUILD_FOLDER"
      # Capture build output to parse warnings
      if cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \
         make -j "$(nproc)" 2>&1 | tee make.log && \
         make package 2>&1 | tee package.log; then
        # Parse and report warnings as GitHub annotations
        echo "::group::Build Warnings"
        # `|| true`: grep exits 1 when it finds no warning, which must not
        # abort the step now that pipefail is active.
        { grep -i "warning" cmake.log make.log package.log || true; } | while read -r line; do
          echo "::warning::$line"
        done
        echo "::endgroup::"
        echo "Build successful on attempt $i"
        break
      else
        echo "Build failed on attempt $i"
        if [ "$i" -eq "$RETRIES" ]; then
          echo "All $RETRIES build attempts failed. Exiting."
          exit 1
        fi
        # Back off a little longer after each failed attempt.
        sleep $((2 * i))
      fi
    done
    echo "Build completed on ${{ matrix.os }}"
- name: Install AMDSMI(RHEL10 & AlmaLinux8)
  if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
  run: |
    # Install the freshly built RPM from the build tree (up to RETRIES
    # attempts), expose the amd-smi CLI on PATH and sanity-check the result.
    # Log lines name the actual matrix entry because this step serves both
    # RHEL10 and AlmaLinux8.
    cd "${{ env.PROJECT_DIR }}/build"
    dnf install python3-setuptools python3-wheel -y
    RETRIES=3
    for i in $(seq 1 $RETRIES); do
      echo "${{ matrix.os }}: Installation attempt $i..."
      # Quoting '*' stops the shell from ever glob-expanding the repo pattern.
      if timeout 10m dnf install -y --skip-broken --disablerepo='*' ./amd-smi-lib-*99999-local*.rpm; then
        echo "Installation successful on attempt $i"
        # -f keeps the step idempotent if the link is already present.
        ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
        echo 'Verifying installation:'
        amd-smi version
        python3 -m pip list | grep amd
        python3 -m pip list | grep pip
        python3 -m pip list | grep setuptools
        echo 'Completed installation on ${{ matrix.os }}'
        break
      else
        echo "Installation failed on attempt $i"
        if [ "$i" -eq "$RETRIES" ]; then
          echo "All $RETRIES installation attempts failed. Exiting."
          exit 1
        fi
        # Back off a little longer after each failed attempt.
        sleep $((2 * i))
      fi
    done
- name: Install AMDSMI
if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
run: |
cd ${{ env.PROJECT_DIR }}/build
case ${{ env.PACKAGE_MANAGER }} in
zypper)
timeout 10m zypper --no-refresh --no-gpg-checks install -y ./amd-smi-lib-*99999-local*.rpm
;;
dnf)
dnf install python3-setuptools python3-wheel -y
RETRIES=3
for i in $(seq 1 $RETRIES); do
echo "Attempt $i: Installing AMDSMI package..."
if timeout 10m dnf install -y --skip-broken --disablerepo=* ./amd-smi-lib-*99999-local*.rpm; then
echo "AMDSMI package installed successfully."
break
else
echo "Installation failed on attempt $i. Retrying..."
if [ $i -eq $RETRIES ]; then
echo "All $RETRIES attempts failed. Exiting."
exit 1
fi
sleep 10
fi
done
;;
esac
ln -s /opt/rocm/bin/amd-smi /usr/local/bin
# Verify Installation
echo 'Verifying installation:'
amd-smi version
python3 -m pip list | grep amd
python3 -m pip list | grep pip
python3 -m pip list | grep setuptools
echo 'Completed installation on ${{ matrix.os }}'
- name: Uninstall
if: always()
run: |
set -e
echo 'Uninstalling on ${{ matrix.os }}'
case ${{ matrix.os }} in
SLES)
zypper remove -y amd-smi-lib || true
;;
RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
dnf remove -y amd-smi-lib || true
;;
esac
rm -f /usr/local/bin/amd-smi
if [ -d /opt/rocm/share/amd_smi ]; then
echo '/opt/rocm/share/amd_smi exists. Removing.'
rm -rf /opt/rocm/share/amd_smi
fi
echo 'Uninstall done on ${{ matrix.os }}'
rpm-test:
name: Tests
needs: [rpm-buildinstall, debian-test]
runs-on:
- self-hosted
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
max-parallel: 10
matrix:
os:
- SLES
- RHEL8
- RHEL9
- RHEL10
- AzureLinux3
- AlmaLinux8
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
steps:
- uses: actions/checkout@v4
- name: Set Project Directory
  run: |
    # Locate the AMDSMI source root (the directory holding its top-level
    # CMakeLists.txt) and export it as PROJECT_DIR for the later steps.
    # `head -n 1` keeps the value to a single line: GITHUB_ENV entries are
    # line-oriented, so several matches would corrupt the variable.
    TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
    if [ -z "$TARGET_DIR" ]; then
      TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
    fi
    # Stop here instead of exporting an empty PROJECT_DIR: later steps build
    # their paths from it and would otherwise operate on "/build".
    if [ -z "$TARGET_DIR" ]; then
      echo "No CMakeLists.txt found under $GITHUB_WORKSPACE. Exiting."
      exit 1
    fi
    echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"
- name: Set PkgMgr
run: |
set -e
case "${{ matrix.os }}" in
SLES)
echo "PACKAGE_MANAGER=zypper" >> $GITHUB_ENV
;;
RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
echo "PACKAGE_MANAGER=dnf" >> $GITHUB_ENV
;;
esac
- name: Add more_itertools
if: matrix.os == 'AzureLinux3'
run: |
set -e
echo 'Installing more_itertools on ${{ matrix.os }}'
python3 -m pip install more_itertools
- name: Build and Install for Tests (RHEL10 & AlmaLinux8)
if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
run: |
set -e
echo 'Building for test on RHEL10/AlmaLinux8 with retries and QA_RPATHS'
BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
RETRIES=5
# Set QA_RPATHS to ignore empty (0x0010 | 0x0002) RPATHs
export QA_RPATHS=$((0x0010 | 0x0002))
for i in $(seq 1 $RETRIES); do
echo "Build attempt $i for RHEL10/AlmaLinux8 test..."
rm -rf $BUILD_FOLDER
mkdir -p $BUILD_FOLDER
cd $BUILD_FOLDER
if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
make -j $(nproc) && \
make package; then
echo "Build successful on attempt $i"
break
else
echo "Build failed on attempt $i"
if [ $i -eq $RETRIES ]; then
echo "All $RETRIES build attempts failed. Exiting."
exit 1
fi
sleep $((2 * i))
fi
done
echo 'Installing for test on RHEL10/AlmaLinux8'
dnf install python3-setuptools python3-wheel -y
for i in $(seq 1 $RETRIES); do
echo "RHEL10/AlmaLinux8: Installation attempt $i for test..."
if timeout 10m dnf install -y --skip-broken --disablerepo=* $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm; then
echo "Installation successful on attempt $i"
ln -s /opt/rocm/bin/amd-smi /usr/local/bin
echo 'Install done for test on RHEL10/AlmaLinux8'
break
else
echo "Installation failed on attempt $i"
if [ $i -eq $RETRIES ]; then
echo "All $RETRIES installation attempts failed. Exiting."
exit 1
fi
sleep $((2 * i))
fi
done
- name: Build and Install for Tests
if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
run: |
set -e
echo 'Building for test on ${{ matrix.os }}'
BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
rm -rf $BUILD_FOLDER
mkdir -p $BUILD_FOLDER
cd $BUILD_FOLDER
cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON
make -j $(nproc)
make package
echo 'Installing for test on ${{ matrix.os }}'
case ${{ env.PACKAGE_MANAGER }} in
zypper)
timeout 10m zypper --no-refresh --no-gpg-checks install -y $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm
;;
dnf)
dnf install python3-setuptools python3-wheel -y
RETRIES=3
for i in $(seq 1 $RETRIES); do
echo "Attempt $i: Installing..."
if timeout 10m dnf install -y --skip-broken --disablerepo=* $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm; then
echo "Install successful."
break
else
echo "Attempt $i failed. Retrying..."
if [ $i -eq $RETRIES ]; then
echo "All attempts failed."
exit 1
fi
sleep 10
fi
done
;;
esac
ln -s /opt/rocm/bin/amd-smi /usr/local/bin
echo 'Install done for test on ${{ matrix.os }}'
- name: AMDSMI Command Tests
shell: bash
run: |
set -e
echo "Running AMDSMI commands on ${{ matrix.os }}"
mkdir -p /tmp/test-results-${{ matrix.os }}
commands=(
"amd-smi version"
"amd-smi list"
"amd-smi static"
"amd-smi firmware"
"amd-smi ucode"
"amd-smi bad-pages"
"amd-smi metric"
"amd-smi process"
"amd-smi topology"
"amd-smi monitor"
"amd-smi dmon"
"amd-smi xgmi"
"amd-smi partition"
)
for cmd in "${commands[@]}"; do
debug_cmd="$cmd --loglevel debug"
echo "Running: $debug_cmd"
if ! eval "$debug_cmd" > /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log 2>&1; then
echo "Command '$debug_cmd' failed."
cat /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log
exit 1
else
echo "$debug_cmd passed."
fi
done
echo "AMDSMI commands done on ${{ matrix.os }}"
- name: Upload AMDSMI Command Test Results
if: always()
uses: actions/upload-artifact@v4
with:
name: amdsmi-command-tests-${{ matrix.os }}
path: /tmp/test-results-${{ matrix.os }}
- name: Run AMDSMI, Python, and Example Tests
shell: bash
run: |
set -e
echo 'Running other tests on ${{ matrix.os }}'
# AMDSMI Tests
echo 'Running AMDSMI tests'
cd /opt/rocm/share/amd_smi/tests
source amdsmitst.exclude
AMDSMI_RETRIES=3
for attempt in $(seq 1 $AMDSMI_RETRIES); do
echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..."
if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log 2>&1; then
echo "AMDSMI tests passed on attempt $attempt"
echo "=============== TEST OUTPUT ==============="
cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]"
echo "=============================================="
echo "AMDSMI tests done"
break
else
TEST_EXIT_CODE=$?
echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE"
if [ $attempt -eq $AMDSMI_RETRIES ]; then
echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure."
echo "=============== TEST OUTPUT ==============="
cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]"
echo "=============================================="
echo "AMDSMI tests failed"
exit $TEST_EXIT_CODE
else
echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..."
sleep $((2 * attempt))
fi
fi
done
# Python Tests
echo 'Running Python tests'
cd /opt/rocm/share/amd_smi/tests/python_unittest
echo "Running integration tests..."
if ! ./integration_test.py -v > /tmp/test-results-${{ matrix.os }}/integration_test_output.txt 2>&1; then
echo "Integration tests failed!"
echo "=============== INTEGRATION TEST OUTPUT ==============="
tail -100 /tmp/test-results-${{ matrix.os }}/integration_test_output.txt
echo "======================================================="
exit 1
else
echo "Integration tests passed"
fi
echo "Running unit tests..."
if ! ./unit_tests.py -v > /tmp/test-results-${{ matrix.os }}/unit_test_output.txt 2>&1; then
echo "Unit tests failed!"
echo "=============== UNIT TEST OUTPUT ==============="
tail -100 /tmp/test-results-${{ matrix.os }}/unit_test_output.txt
echo "================================================"
exit 1
else
echo "Unit tests passed"
fi
echo "Python tests done"
# Example Tests
echo 'Running Example tests'
cd ${{ env.PROJECT_DIR }}/example
rm -rf build
cmake -B build -DENABLE_ESMI_LIB=OFF
make -C build -j $(nproc)
cd build
./amd_smi_drm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log 2>&1 || echo 'amd_smi_drm_ex failed'
./amd_smi_nodrm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log 2>&1 || echo 'amd_smi_nodrm_ex failed'
echo "Example tests done"
- name: AMDSMI Test Results
if: always()
run: |
echo "Displaying AMDSMI test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}"
- name: Integration Test Results
if: always()
run: |
echo "Displaying Integration test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}"
- name: Unit Test Results
if: always()
run: |
echo "Displaying Unit Test Results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}"
- name: Example DRM Test Results
if: always()
run: |
echo "Displaying Example DRM test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}"
- name: Example NoDRM Test Results
if: always()
run: |
echo "Displaying Example NoDRM test results for ${{ matrix.os }}"
cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}"
@@ -0,0 +1,319 @@
name: Auto Label PRs
on:
pull_request:
types: [opened, synchronize, reopened, closed]
workflow_run:
workflows: ["ABI Compliance Check"]
types: [completed]
jobs:
apply-labels:
runs-on: ubuntu-22.04
permissions:
pull-requests: write
actions: read
contents: read
steps:
- name: Add/Remove labels based on branch names and ABI results
uses: actions/github-script@v6
with:
script: |
const pr = context.payload.pull_request;
let prNumber, headSha, baseBranch, headBranch;
// Handle different event types
if (context.eventName === 'pull_request') {
prNumber = pr.number;
headSha = pr.head.sha;
baseBranch = pr.base.ref;
headBranch = pr.head.ref;
} else if (context.eventName === 'workflow_run') {
// Find the associated PR for workflow_run events
const workflowRun = context.payload.workflow_run;
console.log(`Workflow run completed: ${workflowRun.name} with conclusion: ${workflowRun.conclusion}`);
if (workflowRun.event !== 'pull_request') {
console.log('Workflow run was not triggered by a pull request, skipping');
return;
}
const prs = await github.rest.pulls.list({
owner: context.repo.owner,
repo: context.repo.repo,
state: 'open',
head: `${context.repo.owner}:${workflowRun.head_branch}`
});
const associatedPr = prs.data.find(p => p.head.sha === workflowRun.head_sha);
if (!associatedPr) {
console.log('No associated PR found for this workflow run');
return;
}
prNumber = associatedPr.number;
headSha = associatedPr.head.sha;
baseBranch = associatedPr.base.ref;
headBranch = associatedPr.head.ref;
} else {
console.log('Unsupported event type');
return;
}
let labelsApplied = false;
// Debug information
console.log(`Processing PR #${prNumber}: Head: ${headBranch}, Base: ${baseBranch}`);
// Get current PR data to check existing labels
const { data: currentPr } = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: prNumber
});
const existingLabels = currentPr.labels.map(label => label.name);
// Condition 1: PR targeting amd-mainline
if (baseBranch === 'amd-mainline' && context.eventName === 'pull_request') {
const labelToAdd = 'Merge amd-mainline';
try {
if (!existingLabels.includes(labelToAdd)) {
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
labels: [labelToAdd]
});
console.log(`Added label "${labelToAdd}" to PR #${prNumber}`);
labelsApplied = true;
}
} catch (error) {
console.error(`Error adding label "${labelToAdd}": ${error.message}`);
}
}
// Condition 2: Cherry-pick based on head branch name or release target
if (context.eventName === 'pull_request') {
const isCherryPickHead = /cherry.*pick/i.test(headBranch);
const isReleaseTargetBase = baseBranch.startsWith('release/');
if (isCherryPickHead || isReleaseTargetBase) {
const labelToAdd = 'cherry-pick';
try {
if (!existingLabels.includes(labelToAdd)) {
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
labels: [labelToAdd]
});
console.log(`Added label "${labelToAdd}" to PR #${prNumber}`);
labelsApplied = true;
} else {
console.log(`Label "${labelToAdd}" already exists on PR #${prNumber}`);
}
} catch (error) {
console.error(`Error adding label "${labelToAdd}": ${error.message}`);
}
}
}
// ABI BREAKAGE LOGIC: Check on both workflow_run AND pull_request events
let shouldCheckABI = false;
let hasMajorAbiBreakage = false;
let hasMinorAbiBreakage = false;
if (context.eventName === 'workflow_run') {
// Handle workflow_run events (existing logic)
const workflowRun = context.payload.workflow_run;
if (workflowRun.name === 'ABI Compliance Check') {
shouldCheckABI = true;
console.log(`ABI Compliance Check completed with conclusion: ${workflowRun.conclusion}`);
try {
const { data: jobs } = await github.rest.actions.listJobsForWorkflowRun({
owner: context.repo.owner,
repo: context.repo.repo,
run_id: workflowRun.id
});
// Check job conclusions for ABI breakage
for (const job of jobs.jobs) {
console.log(`Job: ${job.name}, Conclusion: ${job.conclusion}`);
if (job.name.includes('Major ABI') && job.conclusion === 'failure') {
hasMajorAbiBreakage = true;
console.log('Major ABI breakage detected from job failure');
}
if (job.name.includes('Minor ABI') && job.conclusion === 'failure') {
hasMinorAbiBreakage = true;
console.log('Minor ABI breakage detected from job failure');
}
}
// If workflow succeeded, no ABI breakage
if (workflowRun.conclusion === 'success') {
console.log('ABI Compliance Check succeeded - no ABI breakage');
hasMajorAbiBreakage = false;
hasMinorAbiBreakage = false;
}
} catch (error) {
console.log(`Could not fetch job details: ${error.message}`);
return;
}
}
} else if (context.eventName === 'pull_request') {
// NEW: Check if amdsmi.h has been reverted on PR events
const hasAbiLabels = existingLabels.includes('MAJOR ABI BREAKAGE') || existingLabels.includes('MINOR ABI BREAKAGE');
if (hasAbiLabels) {
console.log('PR has ABI labels, checking if amdsmi.h changes were reverted...');
shouldCheckABI = true;
try {
// Get the diff for amdsmi.h between base and head
const { data: comparison } = await github.rest.repos.compareCommits({
owner: context.repo.owner,
repo: context.repo.repo,
base: currentPr.base.sha,
head: currentPr.head.sha
});
// Check if amdsmi.h has any changes
const amdsmiFile = comparison.files?.find(file => file.filename === 'include/amd_smi/amdsmi.h');
if (!amdsmiFile) {
console.log('No changes to amdsmi.h found in this PR - removing ABI labels');
hasMajorAbiBreakage = false;
hasMinorAbiBreakage = false;
} else if (amdsmiFile.changes === 0) {
console.log('amdsmi.h file exists but has no changes - removing ABI labels');
hasMajorAbiBreakage = false;
hasMinorAbiBreakage = false;
} else {
console.log(`amdsmi.h has ${amdsmiFile.changes} changes - keeping existing ABI labels`);
// Keep existing labels since we can't determine ABI status without running the check
hasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE');
hasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE');
}
} catch (error) {
console.log(`Error checking file changes: ${error.message}`);
// If we can't check, preserve existing labels
hasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE');
hasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE');
}
}
}
// Manage ABI breakage labels (only if we determined ABI status)
if (shouldCheckABI) {
const abiLabels = {
'MAJOR ABI BREAKAGE': hasMajorAbiBreakage,
'MINOR ABI BREAKAGE': hasMinorAbiBreakage
};
const wasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE');
const wasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE');
for (const [labelName, shouldHaveLabel] of Object.entries(abiLabels)) {
const hasLabel = existingLabels.includes(labelName);
if (shouldHaveLabel && !hasLabel) {
// Add label
try {
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
labels: [labelName]
});
console.log(`✅ Added label "${labelName}" to PR #${prNumber}`);
labelsApplied = true;
} catch (error) {
console.error(`❌ Error adding label "${labelName}": ${error.message}`);
}
} else if (!shouldHaveLabel && hasLabel) {
// Remove label
try {
await github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
name: labelName
});
console.log(`🗑️ Removed label "${labelName}" from PR #${prNumber}`);
labelsApplied = true;
} catch (error) {
console.error(`❌ Error removing label "${labelName}": ${error.message}`);
}
}
}
// Add comments when ABI issues are detected or resolved
if (context.eventName === 'workflow_run') {
// Only add comments for workflow_run events (actual ABI check results)
if (hasMajorAbiBreakage && !wasMajorAbiBreakage) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: '⚠️ **MAJOR ABI BREAKAGE detected** in the latest ABI compliance check. Please review the ABI compliance report and fix any breaking changes.'
});
}
if (hasMinorAbiBreakage && !wasMinorAbiBreakage) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: '⚠️ **MINOR ABI BREAKAGE detected** in the latest ABI compliance check. Please review the ABI compliance report for details.'
});
}
if (!hasMajorAbiBreakage && wasMajorAbiBreakage) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: '✅ **MAJOR ABI BREAKAGE resolved** - ABI compliance check is now passing!'
});
}
if (!hasMinorAbiBreakage && wasMinorAbiBreakage) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: '✅ **MINOR ABI BREAKAGE resolved** - ABI compliance check is now passing!'
});
}
} else if (context.eventName === 'pull_request') {
// Add comment when labels are removed due to file reversion
if (!hasMajorAbiBreakage && wasMajorAbiBreakage) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: '✅ **MAJOR ABI BREAKAGE resolved** - `amdsmi.h` changes have been reverted.'
});
}
if (!hasMinorAbiBreakage && wasMinorAbiBreakage) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: '✅ **MINOR ABI BREAKAGE resolved** - `amdsmi.h` changes have been reverted.'
});
}
}
}
if (!labelsApplied && context.eventName === 'pull_request') {
console.log(`PR #${prNumber} did not match criteria for automatic labeling by this workflow.`);
}
+2 -2
مشاهده پرونده
@@ -2,9 +2,9 @@ name: AMDSMI CI
on:
pull_request:
branches: [amd-staging, amd-mainline, release/rocm-rel-*]
branches: [develop, juwillia/ci-1.0]
push:
branches: [amd-staging, amd-mainline, release/rocm-rel-*]
branches: [develop, juwillia/ci-1.0]
workflow_dispatch:
permissions:
+1 -1
مشاهده پرونده
@@ -9,7 +9,7 @@ on:
jobs:
apply-labels:
runs-on: AMD-ROCm-Internal-dev1
runs-on: ubuntu-22.04
permissions:
pull-requests: write
actions: read