diff --git a/.github/workflows/amdsmi-build.yml b/.github/workflows/amdsmi-build.yml
new file mode 100644
index 0000000000..e21fb98574
--- /dev/null
+++ b/.github/workflows/amdsmi-build.yml
@@ -0,0 +1,892 @@
+# Builds, packages, installs and tests AMD SMI inside per-distro containers
+# on self-hosted runners. The Debian family and the RPM family each get one
+# build/install job and one test job.
+name: AMDSMI CI
+
+on:
+  pull_request:
+    branches: [develop]
+    paths:
+      - 'projects/amdsmi/**'
+      - '.github/workflows/amdsmi-build.yml'
+  push:
+    branches: [develop]
+    paths:
+      - 'projects/amdsmi/**'
+      - '.github/workflows/amdsmi-build.yml'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+env:
+  DEBIAN_FRONTEND: noninteractive
+  DEBCONF_NONINTERACTIVE_SEEN: 'true'  # env values are strings; quoted to make that explicit
+  BUILD_TYPE: Release
+  ROCM_DIR: /opt/rocm
+
+jobs:
+  # Build the .deb package, install it, verify the CLI, then uninstall.
+  debian-buildinstall:
+    name: Build
+    runs-on:
+      - self-hosted
+      - ${{ vars.RUNNER_TYPE }}
+    continue-on-error: true
+    strategy:
+      max-parallel: 10
+      matrix:
+        os: [Ubuntu20, Ubuntu22, Debian10]
+    container:
+      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
+      options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set Project Directory
+        run: |
+          # Find the directory containing the main CMakeLists.txt for AMDSMI
+          TARGET_DIR=$(find $GITHUB_WORKSPACE -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \;)
+
+          if [ -z "$TARGET_DIR" ]; then
+            echo "Could not find CMakeLists.txt in projects/amdsmi. Searching root..."
+            TARGET_DIR=$(find $GITHUB_WORKSPACE -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
+          fi
+
+          echo "PROJECT_DIR=$TARGET_DIR" >> $GITHUB_ENV
+
+      - name: Update repositories for Debian10
+        if: matrix.os == 'Debian10'
+        run: |
+          set -e
+          echo 'Updating repositories for Debian10 (archived)'
+          cat > /etc/apt/sources.list << EOF
+          deb http://archive.debian.org/debian buster main
+          deb http://archive.debian.org/debian-security buster/updates main
+          EOF
+          echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until
+          apt update
+
+      - name: Build AMDSMI
+        shell: bash  # explicit bash runs with -eo pipefail, so `| tee` cannot hide a failed cmake/make
+        run: |
+          set -e
+          echo 'Building on ${{ matrix.os }}'
+          BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
+          RETRIES=3
+
+          for i in $(seq 1 $RETRIES); do
+            echo "Build attempt $i for ${{ matrix.os }}..."
+            rm -rf $BUILD_FOLDER
+            mkdir -p $BUILD_FOLDER
+            cd $BUILD_FOLDER
+
+            # Configure, build, and package
+            if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \
+               make -j $(nproc) 2>&1 | tee make.log && \
+               make package 2>&1 | tee package.log; then
+
+              # Parse and report warnings as GitHub annotations
+              # grep exits 1 when nothing matches; that must not fail a warning-free build
+              echo "::group::Build Warnings"
+              { grep -i "warning" cmake.log make.log package.log || true; } | while read -r line; do
+                echo "::warning::$line"
+              done
+              echo "::endgroup::"
+
+              echo "Build successful on attempt $i"
+              break
+            else
+              echo "Build failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES build attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+          echo "Build completed on ${{ matrix.os }}"
+
+      - name: Install AMDSMI
+        run: |
+          cd ${{ env.PROJECT_DIR }}/build
+          if [ "${{ matrix.os }}" != "Debian10" ]; then
+            apt update
+          fi
+
+          RETRIES=3
+          for i in $(seq 1 $RETRIES); do
+            echo "Installation attempt $i for ${{ matrix.os }}..."
+            if apt install -y ./amd-smi-lib*99999-local_amd64.deb; then
+              echo "Installation successful on attempt $i"
+              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin  # -f: an existing link must not abort the step
+
+              # Verify Installation
+              echo 'Verifying installation:'
+              amd-smi version
+              python3 -m pip list | grep amd
+              python3 -m pip list | grep pip
+              python3 -m pip list | grep setuptools
+              echo 'Completed installation on ${{ matrix.os }}'
+              break
+            else
+              echo "Installation failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES installation attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+          echo "Install completed on ${{ matrix.os }}"
+
+      - name: Uninstall
+        if: always()
+        run: |
+          set -e
+          echo 'Uninstalling on ${{ matrix.os }}'
+          apt remove -y amd-smi-lib || true
+          rm -f /usr/local/bin/amd-smi
+          if [ -d /opt/rocm/share/amd_smi ]; then
+            echo '/opt/rocm/share/amd_smi exists. Removing.'
+            rm -rf /opt/rocm/share/amd_smi
+          fi
+          echo 'Uninstall done on ${{ matrix.os }}'
+
+  # Rebuild and install the .deb, then run CLI, gtest, Python and example tests.
+  debian-test:
+    name: Tests
+    needs: debian-buildinstall
+    runs-on:
+      - self-hosted
+      - ${{ vars.RUNNER_TYPE }}
+    continue-on-error: true
+    strategy:
+      max-parallel: 10
+      matrix:
+        os: [Ubuntu20, Ubuntu22, Debian10]
+    container:
+      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
+      options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set Project Directory
+        run: |
+          TARGET_DIR=$(find $GITHUB_WORKSPACE -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \;)
+          if [ -z "$TARGET_DIR" ]; then
+            TARGET_DIR=$(find $GITHUB_WORKSPACE -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
+          fi
+          echo "PROJECT_DIR=$TARGET_DIR" >> $GITHUB_ENV
+
+      - name: Update repositories for Debian10
+        if: matrix.os == 'Debian10'
+        run: |
+          set -e
+          echo 'Updating repositories for Debian10 (archived)'
+          cat > /etc/apt/sources.list << EOF
+          deb http://archive.debian.org/debian buster main
+          deb http://archive.debian.org/debian-security buster/updates main
+          EOF
+          echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until
+          apt update
+
+      - name: Build and Install for Test
+        run: |
+          set -e
+          echo 'Building for test on ${{ matrix.os }}'
+          BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
+          RETRIES=3
+
+          for i in $(seq 1 $RETRIES); do
+            echo "Build attempt $i for ${{ matrix.os }} test..."
+            rm -rf $BUILD_FOLDER
+            mkdir -p $BUILD_FOLDER
+            cd $BUILD_FOLDER
+
+            if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
+               make -j $(nproc) && \
+               make package; then
+              echo "Build successful on attempt $i"
+              break
+            else
+              echo "Build failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES build attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+
+          echo 'Installing for test on ${{ matrix.os }}'
+          for i in $(seq 1 $RETRIES); do
+            echo "Installation attempt $i for test on ${{ matrix.os }}..."
+            if apt install -y $BUILD_FOLDER/amd-smi-lib*99999-local_amd64.deb; then
+              echo "Installation successful on attempt $i"
+              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin  # -f: an existing link must not abort the step
+              echo 'Install done for test on ${{ matrix.os }}'
+              break
+            else
+              echo "Installation failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES installation attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+
+      - name: AMDSMI Command Tests
+        shell: bash
+        run: |
+          set -e
+          echo "Running AMDSMI commands on ${{ matrix.os }}"
+          mkdir -p /tmp/test-results-${{ matrix.os }}
+          commands=(
+            "amd-smi version"
+            "amd-smi list"
+            "amd-smi static"
+            "amd-smi firmware"
+            "amd-smi ucode"
+            "amd-smi bad-pages"
+            "amd-smi metric"
+            "amd-smi process"
+            "amd-smi topology"
+            "amd-smi monitor"
+            "amd-smi dmon"
+            "amd-smi xgmi"
+            "amd-smi partition"
+          )
+          for cmd in "${commands[@]}"; do
+            debug_cmd="$cmd --loglevel debug"
+            echo "Running: $debug_cmd"
+            if ! eval "$debug_cmd" > /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log 2>&1; then
+              echo "Command '$debug_cmd' failed."
+              cat /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log
+              exit 1
+            else
+              echo "$debug_cmd passed."
+            fi
+          done
+          echo "AMDSMI commands done on ${{ matrix.os }}"
+
+      - name: Upload AMDSMI Command Test Results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: amdsmi-command-tests-${{ matrix.os }}
+          path: /tmp/test-results-${{ matrix.os }}
+
+      - name: Run AMDSMI, Python, and Example Tests
+        shell: bash
+        run: |
+          set -e
+          echo 'Running other tests on ${{ matrix.os }}'
+
+          # AMDSMI Tests
+          echo 'Running AMDSMI tests'
+          cd /opt/rocm/share/amd_smi/tests
+          source amdsmitst.exclude
+
+          AMDSMI_RETRIES=3
+          for attempt in $(seq 1 $AMDSMI_RETRIES); do
+            echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..."
+            if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log 2>&1; then
+              echo "AMDSMI tests passed on attempt $attempt"
+              echo "=============== TEST OUTPUT ==============="
+              grep -E "\[==========\]|\[ *PASSED *\]|\[ *SKIPPED *\]|\[ *FAILED *\]" /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || true
+              echo "=============================================="
+              echo "AMDSMI tests done"
+              break
+            else
+              TEST_EXIT_CODE=$?
+              echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE"
+              if [ $attempt -eq $AMDSMI_RETRIES ]; then
+                echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure."
+                echo "=============== TEST OUTPUT ==============="
+                # '|| true': a log with no gtest summary lines must not replace the real exit code
+                grep -E "\[==========\]|\[ *PASSED *\]|\[ *SKIPPED *\]|\[ *FAILED *\]" /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || true
+                echo "=============================================="
+                echo "AMDSMI tests failed"
+                exit $TEST_EXIT_CODE
+              else
+                echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..."
+                sleep $((2 * attempt))
+              fi
+            fi
+          done
+
+          # Python Tests
+          echo 'Running Python tests'
+          cd /opt/rocm/share/amd_smi/tests/python_unittest
+          echo "Running integration tests..."
+          if ! ./integration_test.py -v > /tmp/test-results-${{ matrix.os }}/integration_test_output.txt 2>&1; then
+            echo "Integration tests failed!"
+            echo "=============== INTEGRATION TEST OUTPUT ==============="
+            tail -100 /tmp/test-results-${{ matrix.os }}/integration_test_output.txt
+            echo "======================================================="
+            exit 1
+          else
+            echo "Integration tests passed"
+          fi
+
+          echo "Running unit tests..."
+          if ! ./unit_tests.py -v > /tmp/test-results-${{ matrix.os }}/unit_test_output.txt 2>&1; then
+            echo "Unit tests failed!"
+            echo "=============== UNIT TEST OUTPUT ==============="
+            tail -100 /tmp/test-results-${{ matrix.os }}/unit_test_output.txt
+            echo "================================================"
+            exit 1
+          else
+            echo "Unit tests passed"
+          fi
+
+          echo "Python tests done"
+
+          # Example Tests
+          echo 'Running Example tests'
+          cd ${{ env.PROJECT_DIR }}/example
+          rm -rf build
+          cmake -B build -DENABLE_ESMI_LIB=OFF
+          make -C build -j $(nproc)
+          cd build
+          ./amd_smi_drm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log 2>&1 || echo 'amd_smi_drm_ex failed'
+          ./amd_smi_nodrm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log 2>&1 || echo 'amd_smi_nodrm_ex failed'
+          echo "Example tests done"
+
+      - name: AMDSMI Test Results
+        if: always()
+        run: |
+          echo "Displaying AMDSMI test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}"
+
+      - name: Integration Test Results
+        if: always()
+        run: |
+          echo "Displaying Integration test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}"
+
+      - name: Unit Test Results
+        if: always()
+        run: |
+          echo "Displaying Unit Test Results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}"
+
+      - name: Example DRM Test Results
+        if: always()
+        run: |
+          echo "Displaying Example DRM test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}"
+
+      - name: Example NoDRM Test Results
+        if: always()
+        run: |
+          echo "Displaying Example NoDRM test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}"
+
+  # Build the .rpm package, install it, verify the CLI, then uninstall.
+  rpm-buildinstall:
+    name: Build
+    runs-on:
+      - self-hosted
+      - ${{ vars.RUNNER_TYPE }}
+    continue-on-error: true
+    strategy:
+      max-parallel: 10
+      matrix:
+        os:
+          - SLES
+          - RHEL8
+          - RHEL9
+          - RHEL10
+          - AzureLinux3
+          - AlmaLinux8
+    container:
+      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
+      options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set Project Directory
+        run: |
+          TARGET_DIR=$(find $GITHUB_WORKSPACE -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \;)
+          if [ -z "$TARGET_DIR" ]; then
+            TARGET_DIR=$(find $GITHUB_WORKSPACE -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
+          fi
+          echo "PROJECT_DIR=$TARGET_DIR" >> $GITHUB_ENV
+
+      - name: Set PkgMgr
+        run: |
+          set -e
+          case "${{ matrix.os }}" in
+            SLES)
+              echo "PACKAGE_MANAGER=zypper" >> $GITHUB_ENV
+              ;;
+            RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
+              echo "PACKAGE_MANAGER=dnf" >> $GITHUB_ENV
+              ;;
+          esac
+
+      - name: Add more_itertools
+        if: matrix.os == 'AzureLinux3'
+        run: |
+          set -e
+          echo 'Installing more_itertools on ${{ matrix.os }}'
+          python3 -m pip install more_itertools
+
+      - name: Build AMDSMI(RHEL10 & AlmaLinux8)
+        if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
+        run: |
+          set -e
+          echo 'Building on ${{ matrix.os }} with retries and QA_RPATHS'
+          BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
+          RETRIES=5
+
+          # Set QA_RPATHS to ignore empty (0x0010) and invalid (0x0002) RPATHs
+          export QA_RPATHS=$((0x0010 | 0x0002))
+
+          for i in $(seq 1 $RETRIES); do
+            echo "Build attempt $i for ${{ matrix.os }} ..."
+            rm -rf $BUILD_FOLDER
+            mkdir -p $BUILD_FOLDER
+            cd $BUILD_FOLDER
+
+            if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
+               make -j $(nproc) && \
+               make package; then
+              echo "Build successful on attempt $i"
+              break
+            else
+              echo "Build failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES build attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+          echo "Build completed on ${{ matrix.os }}"
+
+      - name: Build AMDSMI
+        if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
+        shell: bash  # explicit bash runs with -eo pipefail, so `| tee` cannot hide a failed cmake/make
+        run: |
+          set -e
+          echo 'Building on ${{ matrix.os }}'
+          BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
+          RETRIES=3
+
+          for i in $(seq 1 $RETRIES); do
+            echo "Build attempt $i for ${{ matrix.os }}..."
+            rm -rf $BUILD_FOLDER
+            mkdir -p $BUILD_FOLDER
+            cd $BUILD_FOLDER
+
+            # Capture build output to parse warnings
+            if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \
+               make -j $(nproc) 2>&1 | tee make.log && \
+               make package 2>&1 | tee package.log; then
+
+              # Parse and report warnings as GitHub annotations
+              # grep exits 1 when nothing matches; that must not fail a warning-free build
+              echo "::group::Build Warnings"
+              { grep -i "warning" cmake.log make.log package.log || true; } | while read -r line; do
+                echo "::warning::$line"
+              done
+              echo "::endgroup::"
+
+              echo "Build successful on attempt $i"
+              break
+            else
+              echo "Build failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES build attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+          echo "Build completed on ${{ matrix.os }}"
+
+      - name: Install AMDSMI(RHEL10 & AlmaLinux8)
+        if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
+        run: |
+          cd ${{ env.PROJECT_DIR }}/build
+          dnf install python3-setuptools python3-wheel -y
+
+          RETRIES=3
+          for i in $(seq 1 $RETRIES); do
+            echo "${{ matrix.os }}: Installation attempt $i..."
+            if timeout 10m dnf install -y --skip-broken --disablerepo='*' ./amd-smi-lib-*99999-local*.rpm; then
+              echo "Installation successful on attempt $i"
+              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin  # -f: an existing link must not abort the step
+
+              echo 'Verifying installation:'
+              amd-smi version
+              python3 -m pip list | grep amd
+              python3 -m pip list | grep pip
+              python3 -m pip list | grep setuptools
+              echo 'Completed installation on ${{ matrix.os }}'
+              break
+            else
+              echo "Installation failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES installation attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+
+      - name: Install AMDSMI
+        if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
+        run: |
+          cd ${{ env.PROJECT_DIR }}/build
+          case ${{ env.PACKAGE_MANAGER }} in
+            zypper)
+              timeout 10m zypper --no-refresh --no-gpg-checks install -y ./amd-smi-lib-*99999-local*.rpm
+              ;;
+            dnf)
+              dnf install python3-setuptools python3-wheel -y
+              RETRIES=3
+              for i in $(seq 1 $RETRIES); do
+                echo "Attempt $i: Installing AMDSMI package..."
+                if timeout 10m dnf install -y --skip-broken --disablerepo='*' ./amd-smi-lib-*99999-local*.rpm; then
+                  echo "AMDSMI package installed successfully."
+                  break
+                else
+                  echo "Installation failed on attempt $i. Retrying..."
+                  if [ $i -eq $RETRIES ]; then
+                    echo "All $RETRIES attempts failed. Exiting."
+                    exit 1
+                  fi
+                  sleep 10
+                fi
+              done
+              ;;
+          esac
+          ln -sf /opt/rocm/bin/amd-smi /usr/local/bin  # -f: an existing link must not abort the step
+
+          # Verify Installation
+          echo 'Verifying installation:'
+          amd-smi version
+          python3 -m pip list | grep amd
+          python3 -m pip list | grep pip
+          python3 -m pip list | grep setuptools
+          echo 'Completed installation on ${{ matrix.os }}'
+
+      - name: Uninstall
+        if: always()
+        run: |
+          set -e
+          echo 'Uninstalling on ${{ matrix.os }}'
+          case ${{ matrix.os }} in
+            SLES)
+              zypper remove -y amd-smi-lib || true
+              ;;
+            RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
+              dnf remove -y amd-smi-lib || true
+              ;;
+          esac
+          rm -f /usr/local/bin/amd-smi
+          if [ -d /opt/rocm/share/amd_smi ]; then
+            echo '/opt/rocm/share/amd_smi exists. Removing.'
+            rm -rf /opt/rocm/share/amd_smi
+          fi
+          echo 'Uninstall done on ${{ matrix.os }}'
+
+  # Rebuild and install the .rpm, then run CLI, gtest, Python and example tests.
+  # NOTE(review): also waits for debian-test, presumably to serialize use of the shared runners; confirm.
+  rpm-test:
+    name: Tests
+    needs: [rpm-buildinstall, debian-test]
+    runs-on:
+      - self-hosted
+      - ${{ vars.RUNNER_TYPE }}
+    continue-on-error: true
+    strategy:
+      max-parallel: 10
+      matrix:
+        os:
+          - SLES
+          - RHEL8
+          - RHEL9
+          - RHEL10
+          - AzureLinux3
+          - AlmaLinux8
+    container:
+      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
+      options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set Project Directory
+        run: |
+          TARGET_DIR=$(find $GITHUB_WORKSPACE -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \;)
+          if [ -z "$TARGET_DIR" ]; then
+            TARGET_DIR=$(find $GITHUB_WORKSPACE -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
+          fi
+          echo "PROJECT_DIR=$TARGET_DIR" >> $GITHUB_ENV
+
+      - name: Set PkgMgr
+        run: |
+          set -e
+          case "${{ matrix.os }}" in
+            SLES)
+              echo "PACKAGE_MANAGER=zypper" >> $GITHUB_ENV
+              ;;
+            RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
+              echo "PACKAGE_MANAGER=dnf" >> $GITHUB_ENV
+              ;;
+          esac
+
+      - name: Add more_itertools
+        if: matrix.os == 'AzureLinux3'
+        run: |
+          set -e
+          echo 'Installing more_itertools on ${{ matrix.os }}'
+          python3 -m pip install more_itertools
+
+      - name: Build and Install for Tests (RHEL10 & AlmaLinux8)
+        if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
+        run: |
+          set -e
+          echo 'Building for test on RHEL10/AlmaLinux8 with retries and QA_RPATHS'
+          BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
+          RETRIES=5
+
+          # Set QA_RPATHS to ignore empty (0x0010) and invalid (0x0002) RPATHs
+          export QA_RPATHS=$((0x0010 | 0x0002))
+
+          for i in $(seq 1 $RETRIES); do
+            echo "Build attempt $i for RHEL10/AlmaLinux8 test..."
+            rm -rf $BUILD_FOLDER
+            mkdir -p $BUILD_FOLDER
+            cd $BUILD_FOLDER
+
+            if cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
+               make -j $(nproc) && \
+               make package; then
+              echo "Build successful on attempt $i"
+              break
+            else
+              echo "Build failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES build attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+
+          echo 'Installing for test on RHEL10/AlmaLinux8'
+          dnf install python3-setuptools python3-wheel -y
+
+          for i in $(seq 1 $RETRIES); do
+            echo "RHEL10/AlmaLinux8: Installation attempt $i for test..."
+            if timeout 10m dnf install -y --skip-broken --disablerepo='*' $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm; then
+              echo "Installation successful on attempt $i"
+              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin  # -f: an existing link must not abort the step
+              echo 'Install done for test on RHEL10/AlmaLinux8'
+              break
+            else
+              echo "Installation failed on attempt $i"
+              if [ $i -eq $RETRIES ]; then
+                echo "All $RETRIES installation attempts failed. Exiting."
+                exit 1
+              fi
+              sleep $((2 * i))
+            fi
+          done
+
+      - name: Build and Install for Tests
+        if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
+        run: |
+          set -e
+          echo 'Building for test on ${{ matrix.os }}'
+          BUILD_FOLDER=${{ env.PROJECT_DIR }}/build
+          rm -rf $BUILD_FOLDER
+          mkdir -p $BUILD_FOLDER
+          cd $BUILD_FOLDER
+          cmake ${{ env.PROJECT_DIR }} -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON
+          make -j $(nproc)
+          make package
+
+          echo 'Installing for test on ${{ matrix.os }}'
+          case ${{ env.PACKAGE_MANAGER }} in
+            zypper)
+              timeout 10m zypper --no-refresh --no-gpg-checks install -y $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm
+              ;;
+            dnf)
+              dnf install python3-setuptools python3-wheel -y
+              RETRIES=3
+              for i in $(seq 1 $RETRIES); do
+                echo "Attempt $i: Installing..."
+                if timeout 10m dnf install -y --skip-broken --disablerepo='*' $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm; then
+                  echo "Install successful."
+                  break
+                else
+                  echo "Attempt $i failed. Retrying..."
+                  if [ $i -eq $RETRIES ]; then
+                    echo "All attempts failed."
+                    exit 1
+                  fi
+                  sleep 10
+                fi
+              done
+              ;;
+          esac
+          ln -sf /opt/rocm/bin/amd-smi /usr/local/bin  # -f: an existing link must not abort the step
+          echo 'Install done for test on ${{ matrix.os }}'
+
+      - name: AMDSMI Command Tests
+        shell: bash
+        run: |
+          set -e
+          echo "Running AMDSMI commands on ${{ matrix.os }}"
+          mkdir -p /tmp/test-results-${{ matrix.os }}
+          commands=(
+            "amd-smi version"
+            "amd-smi list"
+            "amd-smi static"
+            "amd-smi firmware"
+            "amd-smi ucode"
+            "amd-smi bad-pages"
+            "amd-smi metric"
+            "amd-smi process"
+            "amd-smi topology"
+            "amd-smi monitor"
+            "amd-smi dmon"
+            "amd-smi xgmi"
+            "amd-smi partition"
+          )
+          for cmd in "${commands[@]}"; do
+            debug_cmd="$cmd --loglevel debug"
+            echo "Running: $debug_cmd"
+            if ! eval "$debug_cmd" > /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log 2>&1; then
+              echo "Command '$debug_cmd' failed."
+              cat /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log
+              exit 1
+            else
+              echo "$debug_cmd passed."
+            fi
+          done
+          echo "AMDSMI commands done on ${{ matrix.os }}"
+
+      - name: Upload AMDSMI Command Test Results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: amdsmi-command-tests-${{ matrix.os }}
+          path: /tmp/test-results-${{ matrix.os }}
+
+      - name: Run AMDSMI, Python, and Example Tests
+        shell: bash
+        run: |
+          set -e
+          echo 'Running other tests on ${{ matrix.os }}'
+
+          # AMDSMI Tests
+          echo 'Running AMDSMI tests'
+          cd /opt/rocm/share/amd_smi/tests
+          source amdsmitst.exclude
+
+          AMDSMI_RETRIES=3
+          for attempt in $(seq 1 $AMDSMI_RETRIES); do
+            echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..."
+            if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log 2>&1; then
+              echo "AMDSMI tests passed on attempt $attempt"
+              echo "=============== TEST OUTPUT ==============="
+              grep -E "\[==========\]|\[ *PASSED *\]|\[ *SKIPPED *\]|\[ *FAILED *\]" /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || true
+              echo "=============================================="
+              echo "AMDSMI tests done"
+              break
+            else
+              TEST_EXIT_CODE=$?
+              echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE"
+              if [ $attempt -eq $AMDSMI_RETRIES ]; then
+                echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure."
+                echo "=============== TEST OUTPUT ==============="
+                # '|| true': a log with no gtest summary lines must not replace the real exit code
+                grep -E "\[==========\]|\[ *PASSED *\]|\[ *SKIPPED *\]|\[ *FAILED *\]" /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || true
+                echo "=============================================="
+                echo "AMDSMI tests failed"
+                exit $TEST_EXIT_CODE
+              else
+                echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..."
+                sleep $((2 * attempt))
+              fi
+            fi
+          done
+
+          # Python Tests
+          echo 'Running Python tests'
+          cd /opt/rocm/share/amd_smi/tests/python_unittest
+          echo "Running integration tests..."
+          if ! ./integration_test.py -v > /tmp/test-results-${{ matrix.os }}/integration_test_output.txt 2>&1; then
+            echo "Integration tests failed!"
+            echo "=============== INTEGRATION TEST OUTPUT ==============="
+            tail -100 /tmp/test-results-${{ matrix.os }}/integration_test_output.txt
+            echo "======================================================="
+            exit 1
+          else
+            echo "Integration tests passed"
+          fi
+
+          echo "Running unit tests..."
+          if ! ./unit_tests.py -v > /tmp/test-results-${{ matrix.os }}/unit_test_output.txt 2>&1; then
+            echo "Unit tests failed!"
+            echo "=============== UNIT TEST OUTPUT ==============="
+            tail -100 /tmp/test-results-${{ matrix.os }}/unit_test_output.txt
+            echo "================================================"
+            exit 1
+          else
+            echo "Unit tests passed"
+          fi
+
+          echo "Python tests done"
+
+          # Example Tests
+          echo 'Running Example tests'
+          cd ${{ env.PROJECT_DIR }}/example
+          rm -rf build
+          cmake -B build -DENABLE_ESMI_LIB=OFF
+          make -C build -j $(nproc)
+          cd build
+          ./amd_smi_drm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log 2>&1 || echo 'amd_smi_drm_ex failed'
+          ./amd_smi_nodrm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log 2>&1 || echo 'amd_smi_nodrm_ex failed'
+          echo "Example tests done"
+
+      - name: AMDSMI Test Results
+        if: always()
+        run: |
+          echo "Displaying AMDSMI test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}"
+
+      - name: Integration Test Results
+        if: always()
+        run: |
+          echo "Displaying Integration test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}"
+
+      - name: Unit Test Results
+        if: always()
+        run: |
+          echo "Displaying Unit Test Results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}"
+
+      - name: Example DRM Test Results
+        if: always()
+        run: |
+          echo "Displaying Example DRM test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}"
+
+      - name: Example NoDRM Test Results
+        if: always()
+        run: |
+          echo "Displaying Example NoDRM test results for ${{ matrix.os }}"
+          cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}"
diff --git a/.github/workflows/auto-label.yml b/.github/workflows/auto-label.yml
new file mode 100644
index 0000000000..a61b86c37d
--- /dev/null
+++ b/.github/workflows/auto-label.yml
@@ -0,0 +1,319 @@
+name: Auto Label PRs
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, closed]
+  workflow_run:
+    workflows: ["ABI Compliance Check"]
+    types: [completed]
+
+jobs:
+  apply-labels:
+    runs-on: ubuntu-22.04
+    permissions:
+      pull-requests: write
+      actions: read
+      contents: read
+    steps:
+      - name: Add/Remove labels based on branch names and ABI results
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const pr = context.payload.pull_request;
+            let prNumber, headSha, baseBranch, headBranch;
+
+            // Handle different event types
+            if (context.eventName === 'pull_request') {
+              prNumber = pr.number;
+              headSha = pr.head.sha;
+              baseBranch = pr.base.ref;
+              headBranch = pr.head.ref;
+            } else if (context.eventName === 'workflow_run') {
+              // Find the associated PR for workflow_run events
+              const workflowRun = context.payload.workflow_run;
+              console.log(`Workflow run completed: ${workflowRun.name} with conclusion: ${workflowRun.conclusion}`);
+
+              if (workflowRun.event !== 'pull_request') {
+                console.log('Workflow run was not triggered by a pull request, skipping');
+                return;
+              }
+
+              const prs = await github.rest.pulls.list({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                state: 'open',
+                head: `${context.repo.owner}:${workflowRun.head_branch}`
+              });
+
+              const associatedPr = prs.data.find(p => p.head.sha === workflowRun.head_sha);
+
+              if (!associatedPr) {
+                console.log('No associated PR found for this
workflow run');
+                return;
+              }
+
+              prNumber = associatedPr.number;
+              headSha = associatedPr.head.sha;
+              baseBranch = associatedPr.base.ref;
+              headBranch = associatedPr.head.ref;
+            } else {
+              console.log('Unsupported event type');
+              return;
+            }
+
+            // Tracks whether this run added or removed any label; it is only read
+            // by the "did not match criteria" log at the end of the script.
+            let labelsApplied = false;
+
+            // Debug information
+            console.log(`Processing PR #${prNumber}: Head: ${headBranch}, Base: ${baseBranch}`);
+
+            // Fetch the PR so label decisions are made against its current labels.
+            const { data: currentPr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: prNumber
+            });
+            const existingLabels = currentPr.labels.map(label => label.name);
+
+            // Condition 1: PR targeting amd-mainline (pull_request events only).
+            if (baseBranch === 'amd-mainline' && context.eventName === 'pull_request') {
+              const labelToAdd = 'Merge amd-mainline';
+              try {
+                if (!existingLabels.includes(labelToAdd)) {
+                  await github.rest.issues.addLabels({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: prNumber,
+                    labels: [labelToAdd]
+                  });
+                  console.log(`Added label "${labelToAdd}" to PR #${prNumber}`);
+                  labelsApplied = true;
+                }
+              } catch (error) {
+                // Best effort: a labeling failure is logged, not fatal.
+                console.error(`Error adding label "${labelToAdd}": ${error.message}`);
+              }
+            }
+
+            // Condition 2: cherry-pick, inferred from the head branch name or from a
+            // release/* base branch (pull_request events only).
+            if (context.eventName === 'pull_request') {
+              const isCherryPickHead = /cherry.*pick/i.test(headBranch);
+              const isReleaseTargetBase = baseBranch.startsWith('release/');
+
+              if (isCherryPickHead || isReleaseTargetBase) {
+                const labelToAdd = 'cherry-pick';
+                try {
+                  if (!existingLabels.includes(labelToAdd)) {
+                    await github.rest.issues.addLabels({
+                      owner: context.repo.owner,
+                      repo: context.repo.repo,
+                      issue_number: prNumber,
+                      labels: [labelToAdd]
+                    });
+                    console.log(`Added label "${labelToAdd}" to PR #${prNumber}`);
+                    labelsApplied = true;
+                  } else {
+                    console.log(`Label "${labelToAdd}" already exists on PR #${prNumber}`);
+                  }
+                } catch (error) {
+                  console.error(`Error adding label "${labelToAdd}": ${error.message}`);
+                }
+              }
+            }
+
+            // ABI breakage status is evaluated on both workflow_run and pull_request
+            // events. shouldCheckABI stays false when neither path below applies, in
+            // which case the ABI labels are left untouched.
+            let shouldCheckABI = false;
+            let hasMajorAbiBreakage = false;
+            let hasMinorAbiBreakage = false;
+
+            if (context.eventName === 'workflow_run') {
+              // workflow_run: derive ABI status from the jobs of the completed
+              // 'ABI Compliance Check' run.
+              const workflowRun = context.payload.workflow_run;
+
+              if (workflowRun.name === 'ABI Compliance Check') {
+                shouldCheckABI = true;
+                console.log(`ABI Compliance Check completed with conclusion: ${workflowRun.conclusion}`);
+
+                try {
+                  const { data: jobs } = await github.rest.actions.listJobsForWorkflowRun({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    run_id: workflowRun.id
+                  });
+
+                  // A failed job whose name mentions Major/Minor ABI marks that breakage.
+                  for (const job of jobs.jobs) {
+                    console.log(`Job: ${job.name}, Conclusion: ${job.conclusion}`);
+
+                    if (job.name.includes('Major ABI') && job.conclusion === 'failure') {
+                      hasMajorAbiBreakage = true;
+                      console.log('Major ABI breakage detected from job failure');
+                    }
+
+                    if (job.name.includes('Minor ABI') && job.conclusion === 'failure') {
+                      hasMinorAbiBreakage = true;
+                      console.log('Minor ABI breakage detected from job failure');
+                    }
+                  }
+
+                  // An overall successful run overrides any per-job result.
+                  if (workflowRun.conclusion === 'success') {
+                    console.log('ABI Compliance Check succeeded - no ABI breakage');
+                    hasMajorAbiBreakage = false;
+                    hasMinorAbiBreakage = false;
+                  }
+
+                } catch (error) {
+                  // Without job details the ABI status is unknown; stop here rather
+                  // than add or remove labels on a guess.
+                  console.log(`Could not fetch job details: ${error.message}`);
+                  return;
+                }
+              }
+            } else if (context.eventName === 'pull_request') {
+              // pull_request: only relevant when the PR already carries an ABI label;
+              // check whether the amdsmi.h changes have since been reverted.
+              const hasAbiLabels = existingLabels.includes('MAJOR ABI BREAKAGE') || existingLabels.includes('MINOR ABI BREAKAGE');
+
+              if (hasAbiLabels) {
+                console.log('PR has ABI labels, checking if amdsmi.h changes were reverted...');
+                shouldCheckABI = true;
+
+                try {
+                  // Get the changed files between the PR base and head commits.
+                  // NOTE(review): the compare endpoint returns a capped, paginated
+                  // file list; on very large PRs amdsmi.h could be missing from the
+                  // first page - confirm whether pagination is needed here.
+                  const { data: comparison } = await github.rest.repos.compareCommits({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    base: currentPr.base.sha,
+                    head: currentPr.head.sha
+                  });
+
+                  // Match the header by exact path or by path suffix. Compared file
+                  // names are repository-relative, so when the project lives under a
+                  // subdirectory (e.g. projects/amdsmi/) an exact match on the bare
+                  // path never succeeds and the ABI labels would always be removed.
+                  const abiHeaderPath = 'include/amd_smi/amdsmi.h';
+                  const amdsmiFile = comparison.files?.find(file =>
+                    file.filename === abiHeaderPath || file.filename.endsWith(`/${abiHeaderPath}`));
+
+                  if (!amdsmiFile) {
+                    console.log('No changes to amdsmi.h found in this PR - removing ABI labels');
+                    hasMajorAbiBreakage = false;
+                    hasMinorAbiBreakage = false;
+                  } else if (amdsmiFile.changes === 0) {
+                    console.log('amdsmi.h file exists but has no changes - removing ABI labels');
+                    hasMajorAbiBreakage = false;
+                    hasMinorAbiBreakage = false;
+                  } else {
+                    console.log(`amdsmi.h has ${amdsmiFile.changes} changes - keeping existing ABI labels`);
+                    // Keep existing labels since we can't determine ABI status without running the check
+                    hasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE');
+                    hasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE');
+                  }
+
+                } catch (error) {
+                  console.log(`Error checking file changes: ${error.message}`);
+                  // If we can't check, preserve existing labels
+                  hasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE');
+                  hasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE');
+                }
+              }
+            }
+
+            // Reconcile the ABI labels with the status computed above (only when an
+            // ABI status was actually determined).
+            if (shouldCheckABI) {
+              // Desired state: label name -> whether the PR should carry it.
+              const abiLabels = {
+                'MAJOR ABI BREAKAGE': hasMajorAbiBreakage,
+                'MINOR ABI BREAKAGE': hasMinorAbiBreakage
+              };
+
+              // Previous state, used below to comment only on transitions.
+              const wasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE');
+              const wasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE');
+
+              for (const [labelName, shouldHaveLabel] of Object.entries(abiLabels)) {
+                const hasLabel = existingLabels.includes(labelName);
+
+                if (shouldHaveLabel && !hasLabel) {
+                  // Add label
+                  try {
+                    await github.rest.issues.addLabels({
+                      owner: context.repo.owner,
+                      repo: context.repo.repo,
+                      issue_number: prNumber,
+                      labels: [labelName]
+                    });
+                    console.log(`✅ Added label "${labelName}" to PR #${prNumber}`);
+                    labelsApplied = true;
+                  } catch (error) {
+                    console.error(`❌ Error adding label "${labelName}": ${error.message}`);
+                  }
+                } else if (!shouldHaveLabel && hasLabel) {
+                  // Remove label
+                  try {
+                    await github.rest.issues.removeLabel({
+                      owner: context.repo.owner,
+                      repo: context.repo.repo,
+                      issue_number: prNumber,
+                      name: labelName
+                    });
+                    console.log(`🗑️ Removed label "${labelName}" from PR #${prNumber}`);
+                    labelsApplied = true;
+                  } catch (error) {
+                    console.error(`❌ Error removing label "${labelName}": ${error.message}`);
+                  }
+                }
+              }
+
+              // Post a PR comment on a best-effort basis: a failed comment is logged
+              // instead of aborting the step, matching the label handling above.
+              const postPrComment = async (body) => {
+                try {
+                  await github.rest.issues.createComment({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: prNumber,
+                    body
+                  });
+                } catch (error) {
+                  console.error(`❌ Error posting comment on PR #${prNumber}: ${error.message}`);
+                }
+              };
+
+              // Comment only when the ABI status changed relative to the labels the
+              // PR had at the start of this run.
+              if (context.eventName === 'workflow_run') {
+                // workflow_run events carry actual ABI check results, so both newly
+                // detected and newly resolved breakages are reported.
+                if (hasMajorAbiBreakage && !wasMajorAbiBreakage) {
+                  await postPrComment('⚠️ **MAJOR ABI BREAKAGE detected** in the latest ABI compliance check. Please review the ABI compliance report and fix any breaking changes.');
+                }
+
+                if (hasMinorAbiBreakage && !wasMinorAbiBreakage) {
+                  await postPrComment('⚠️ **MINOR ABI BREAKAGE detected** in the latest ABI compliance check. Please review the ABI compliance report for details.');
+                }
+
+                if (!hasMajorAbiBreakage && wasMajorAbiBreakage) {
+                  await postPrComment('✅ **MAJOR ABI BREAKAGE resolved** - ABI compliance check is now passing!');
+                }
+
+                if (!hasMinorAbiBreakage && wasMinorAbiBreakage) {
+                  await postPrComment('✅ **MINOR ABI BREAKAGE resolved** - ABI compliance check is now passing!');
+                }
+              } else if (context.eventName === 'pull_request') {
+                // pull_request events can only clear labels (file reverted), so only
+                // the "resolved" transitions are reported.
+                if (!hasMajorAbiBreakage && wasMajorAbiBreakage) {
+                  await postPrComment('✅ **MAJOR ABI BREAKAGE resolved** - `amdsmi.h` changes have been reverted.');
+                }
+
+                if (!hasMinorAbiBreakage && wasMinorAbiBreakage) {
+                  await postPrComment('✅ **MINOR ABI BREAKAGE resolved** - `amdsmi.h` changes have been reverted.');
+                }
+              }
+            }
+
+            if (!labelsApplied && context.eventName === 'pull_request') {
+              console.log(`PR #${prNumber} did not match criteria for automatic labeling by this workflow.`);
+            }
\ No newline at end of file
diff --git a/projects/amdsmi/.github/workflows/amdsmi-build.yml b/projects/amdsmi/.github/workflows/amdsmi-build.yml
index ef8c20bdb2..b63c011502 100644
--- a/projects/amdsmi/.github/workflows/amdsmi-build.yml
+++ b/projects/amdsmi/.github/workflows/amdsmi-build.yml
@@ -2,9 +2,9 @@ name: AMDSMI CI
 
 on:
   pull_request:
-    branches: [amd-staging, amd-mainline, release/rocm-rel-*]
+    branches: [develop, juwillia/ci-1.0]
   push:
-    branches: [amd-staging, amd-mainline, release/rocm-rel-*]
+    branches: [develop, juwillia/ci-1.0]
   workflow_dispatch:
 
 permissions:
diff --git a/projects/amdsmi/.github/workflows/auto-label.yml b/projects/amdsmi/.github/workflows/auto-label.yml
index 40da019e38..03eb05d638 100644
--- a/projects/amdsmi/.github/workflows/auto-label.yml
+++ b/projects/amdsmi/.github/workflows/auto-label.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   apply-labels:
-    runs-on: AMD-ROCm-Internal-dev1
+    runs-on: ubuntu-22.04
     permissions:
       pull-requests: write
       actions: read