---
# AMDSMI CI
#
# Builds, packages, installs and tests the AMD SMI library from
# projects/amdsmi inside per-distribution containers on self-hosted runners.
#
# Jobs:
#   debian-buildinstall  build + package (.deb) + install smoke check
#   debian-test          rebuild, install and run CLI / gtest / Python / example tests
#   rpm-buildinstall     build + package (.rpm) + install smoke check
#   rpm-test             rebuild, install and run CLI / gtest / Python / example tests
#
# Container images are looked up from repository variables named
# <matrix.os>_DOCKER_IMAGE; the runner label comes from vars.RUNNER_TYPE.
name: AMDSMI CI

on:
  pull_request:
    branches: [develop]
    paths:
      - 'projects/amdsmi/**'
      - '.github/workflows/amdsmi-build.yml'
  push:
    branches: [develop]
    paths:
      - 'projects/amdsmi/**'
      - '.github/workflows/amdsmi-build.yml'
  workflow_dispatch:

permissions:
  contents: read

env:
  DEBIAN_FRONTEND: noninteractive
  # Quoted so the value is unambiguously a string for every YAML consumer.
  DEBCONF_NONINTERACTIVE_SEEN: "true"
  BUILD_TYPE: Release
  ROCM_DIR: /opt/rocm

jobs:
  # ---------------------------------------------------------------------------
  # Debian-family: build, package, install and verify the .deb.
  # ---------------------------------------------------------------------------
  debian-buildinstall:
    name: Build
    runs-on:
      - self-hosted
      - ${{ vars.RUNNER_TYPE }}
    continue-on-error: true
    strategy:
      max-parallel: 10
      matrix:
        os: [Ubuntu20, Ubuntu22, Debian10]
    container:
      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
      options: >-
        --rm --privileged --device=/dev/kfd --device=/dev/dri
        --group-add video --cap-add=SYS_PTRACE
        --security-opt seccomp=unconfined --shm-size=64G
        --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
    steps:
      - uses: actions/checkout@v4

      - name: Set Artifact Metadata
        if: github.event_name == 'pull_request'
        run: |
          # Set PR number and date for artifact naming
          echo "PR_NUMBER=PR${{ github.event.pull_request.number }}" >> "$GITHUB_ENV"
          # Set date in MMDDYY-HHMM format (UTC time)
          echo "BUILD_DATE=$(date -u +%m%d%y-%H%M)" >> "$GITHUB_ENV"

      - name: Set Project Directory
        run: |
          # Find the directory containing the main CMakeLists.txt for AMDSMI.
          # head -n 1 keeps PROJECT_DIR a single line even if several paths match.
          TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
          if [ -z "$TARGET_DIR" ]; then
            echo "Could not find CMakeLists.txt in projects/amdsmi. Searching root..."
            TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
          fi
          # An empty PROJECT_DIR would make later steps operate on /build.
          if [ -z "$TARGET_DIR" ]; then
            echo "::error::Could not locate a CMakeLists.txt under $GITHUB_WORKSPACE"
            exit 1
          fi
          echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"

      - name: Update repositories for Debian10
        if: matrix.os == 'Debian10'
        run: |
          set -e
          echo 'Updating repositories for Debian10 (archived)'
          cat > /etc/apt/sources.list << EOF
          deb http://archive.debian.org/debian buster main
          deb http://archive.debian.org/debian-security buster/updates main
          EOF
          echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until
          apt update

      - name: Build AMDSMI
        # Explicit bash + pipefail: the build commands are piped through tee,
        # and without pipefail the pipeline status would be tee's (always 0),
        # hiding cmake/make failures and disabling the retry logic.
        shell: bash
        run: |
          set -eo pipefail
          echo 'Building on ${{ matrix.os }}'
          BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
          RETRIES=3
          for i in $(seq 1 $RETRIES); do
            echo "Build attempt $i for ${{ matrix.os }}..."
            rm -rf "$BUILD_FOLDER"
            mkdir -p "$BUILD_FOLDER"
            cd "$BUILD_FOLDER"
            # Configure, build, and package
            if cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \
               make -j $(nproc) 2>&1 | tee make.log && \
               make package 2>&1 | tee package.log; then
              # Parse and report warnings as GitHub annotations
              echo "::group::Build Warnings"
              # grep exits 1 when there are no warnings; that must not fail the step.
              { grep -i "warning" cmake.log make.log package.log || true; } | while read -r line; do
                echo "::warning::$line"
              done
              echo "::endgroup::"
              echo "Build successful on attempt $i"
              break
            else
              echo "Build failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES build attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done
          echo "Build completed on ${{ matrix.os }}"

      - name: Upload Debian Package Artifacts
        if: github.event_name == 'pull_request'
        uses: actions/upload-artifact@v4
        with:
          name: amd-smi-lib-deb-${{ matrix.os }}-${{ env.PR_NUMBER }}-${{ env.BUILD_DATE }}
          path: ${{ env.PROJECT_DIR }}/build/amd-smi-lib*99999-local_amd64.deb
          if-no-files-found: warn
          retention-days: 7

      - name: Install AMDSMI
        run: |
          cd "${{ env.PROJECT_DIR }}/build"
          # Debian10 already ran apt update after switching to the archive mirrors.
          if [ "${{ matrix.os }}" != "Debian10" ]; then
            apt update
          fi
          RETRIES=3
          for i in $(seq 1 $RETRIES); do
            echo "Installation attempt $i for ${{ matrix.os }}..."
            if apt install -y ./amd-smi-lib*99999-local_amd64.deb; then
              echo "Installation successful on attempt $i"
              # -f keeps the step idempotent if the link already exists.
              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
              # Verify Installation
              echo 'Verifying installation:'
              amd-smi version
              python3 -m pip list | grep amd
              python3 -m pip list | grep pip
              python3 -m pip list | grep setuptools
              echo 'Completed installation on ${{ matrix.os }}'
              break
            else
              echo "Installation failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES installation attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done
          echo "Installation completed on ${{ matrix.os }}"

      - name: Uninstall
        if: always()
        run: |
          set -e
          echo 'Uninstalling on ${{ matrix.os }}'
          # Best effort: the package may not be installed if an earlier step failed.
          apt remove -y amd-smi-lib || true
          rm -f /usr/local/bin/amd-smi
          if [ -d /opt/rocm/share/amd_smi ]; then
            echo '/opt/rocm/share/amd_smi exists. Removing.'
            rm -rf /opt/rocm/share/amd_smi
          fi
          echo 'Uninstall done on ${{ matrix.os }}'

  # ---------------------------------------------------------------------------
  # Debian-family: rebuild, install and run the test suites.
  # ---------------------------------------------------------------------------
  debian-test:
    name: Tests
    needs: debian-buildinstall
    runs-on:
      - self-hosted
      - ${{ vars.RUNNER_TYPE }}
    continue-on-error: true
    strategy:
      max-parallel: 10
      matrix:
        os: [Ubuntu20, Ubuntu22, Debian10]
    container:
      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
      options: >-
        --rm --privileged --device=/dev/kfd --device=/dev/dri
        --group-add video --cap-add=SYS_PTRACE
        --security-opt seccomp=unconfined --shm-size=64G
        --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
    steps:
      - uses: actions/checkout@v4

      - name: Set Project Directory
        run: |
          # head -n 1 keeps PROJECT_DIR a single line even if several paths match.
          TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
          if [ -z "$TARGET_DIR" ]; then
            TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
          fi
          # An empty PROJECT_DIR would make later steps operate on /build.
          if [ -z "$TARGET_DIR" ]; then
            echo "::error::Could not locate a CMakeLists.txt under $GITHUB_WORKSPACE"
            exit 1
          fi
          echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"

      - name: Update repositories for Debian10
        if: matrix.os == 'Debian10'
        run: |
          set -e
          echo 'Updating repositories for Debian10 (archived)'
          cat > /etc/apt/sources.list << EOF
          deb http://archive.debian.org/debian buster main
          deb http://archive.debian.org/debian-security buster/updates main
          EOF
          echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until
          apt update

      - name: Build and Install for Test
        run: |
          set -e
          echo 'Building for test on ${{ matrix.os }}'
          BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
          RETRIES=3
          for i in $(seq 1 $RETRIES); do
            echo "Build attempt $i for ${{ matrix.os }} test..."
            rm -rf "$BUILD_FOLDER"
            mkdir -p "$BUILD_FOLDER"
            cd "$BUILD_FOLDER"
            if cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
               make -j $(nproc) && \
               make package; then
              echo "Build successful on attempt $i"
              break
            else
              echo "Build failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES build attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done
          echo 'Installing for test on ${{ matrix.os }}'
          for i in $(seq 1 $RETRIES); do
            echo "Installation attempt $i for test on ${{ matrix.os }}..."
            if apt install -y "$BUILD_FOLDER"/amd-smi-lib*99999-local_amd64.deb; then
              echo "Installation successful on attempt $i"
              # -f keeps the step idempotent if the link already exists.
              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
              echo 'Install done for test on ${{ matrix.os }}'
              break
            else
              echo "Installation failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES installation attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done

      - name: AMDSMI Command Tests
        shell: bash
        run: |
          set -e
          echo "Running AMDSMI commands on ${{ matrix.os }}"
          RESULTS_DIR="/tmp/test-results-${{ matrix.os }}"
          mkdir -p "$RESULTS_DIR"
          commands=(
            "amd-smi version"
            "amd-smi list"
            "amd-smi static"
            "amd-smi firmware"
            "amd-smi ucode"
            "amd-smi bad-pages"
            "amd-smi metric"
            "amd-smi process"
            "amd-smi topology"
            "amd-smi monitor"
            "amd-smi dmon"
            "amd-smi xgmi"
            "amd-smi partition"
          )
          for cmd in "${commands[@]}"; do
            debug_cmd="$cmd --loglevel debug"
            # One log per command, named after the command with spaces as underscores.
            log_file="$RESULTS_DIR/${cmd// /_}.log"
            echo "Running: $debug_cmd"
            if ! eval "$debug_cmd" > "$log_file" 2>&1; then
              echo "Command '$debug_cmd' failed."
              cat "$log_file"
              exit 1
            else
              echo "$debug_cmd passed."
            fi
          done
          echo "AMDSMI commands done on ${{ matrix.os }}"

      - name: Upload AMDSMI Command Test Results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: amdsmi-command-tests-${{ matrix.os }}
          path: /tmp/test-results-${{ matrix.os }}

      - name: Run AMDSMI, Python, and Example Tests
        shell: bash
        run: |
          set -e
          echo 'Running other tests on ${{ matrix.os }}'
          RESULTS_DIR="/tmp/test-results-${{ matrix.os }}"
          SUMMARY_PATTERN="\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]"

          # AMDSMI Tests
          echo 'Running AMDSMI tests'
          cd /opt/rocm/share/amd_smi/tests
          # amdsmitst.exclude is sourced for BLACKLIST_ALL_ASICS, used in the filter below.
          source amdsmitst.exclude
          AMDSMI_RETRIES=3
          for attempt in $(seq 1 $AMDSMI_RETRIES); do
            echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..."
            if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > "$RESULTS_DIR/amdsmi_tests.log" 2>&1; then
              echo "AMDSMI tests passed on attempt $attempt"
              echo "=============== TEST OUTPUT ==============="
              # "|| true": a log without summary lines must not fail a passing run.
              grep -E "$SUMMARY_PATTERN" "$RESULTS_DIR/amdsmi_tests.log" || true
              echo "=============================================="
              echo "AMDSMI tests done"
              break
            else
              TEST_EXIT_CODE=$?
              echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE"
              if [ $attempt -eq $AMDSMI_RETRIES ]; then
                echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure."
                echo "=============== TEST OUTPUT ==============="
                # "|| true": keep going so the real test exit code is reported below.
                grep -E "$SUMMARY_PATTERN" "$RESULTS_DIR/amdsmi_tests.log" || true
                echo "=============================================="
                echo "AMDSMI tests failed"
                exit $TEST_EXIT_CODE
              else
                echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..."
                sleep $((2 * attempt))
              fi
            fi
          done

          # Python Tests
          echo 'Running Python tests'
          cd /opt/rocm/share/amd_smi/tests/python_unittest
          echo "Running integration tests..."
          if ! ./integration_test.py -v > "$RESULTS_DIR/integration_test_output.txt" 2>&1; then
            echo "Integration tests failed!"
            echo "=============== INTEGRATION TEST OUTPUT ==============="
            tail -100 "$RESULTS_DIR/integration_test_output.txt"
            echo "======================================================="
            exit 1
          else
            echo "Integration tests passed"
          fi
          echo "Running unit tests..."
          if ! ./unit_tests.py -v > "$RESULTS_DIR/unit_test_output.txt" 2>&1; then
            echo "Unit tests failed!"
            echo "=============== UNIT TEST OUTPUT ==============="
            tail -100 "$RESULTS_DIR/unit_test_output.txt"
            echo "================================================"
            exit 1
          else
            echo "Unit tests passed"
          fi
          echo "Python tests done"

          # Example Tests
          echo 'Running Example tests'
          cd "${{ env.PROJECT_DIR }}/example"
          rm -rf build
          cmake -B build -DENABLE_ESMI_LIB=OFF
          make -C build -j $(nproc)
          cd build
          # Example binaries are informational: a failure is logged but does not fail the step.
          ./amd_smi_drm_ex > "$RESULTS_DIR/amd_smi_drm_ex.log" 2>&1 || echo 'amd_smi_drm_ex failed'
          ./amd_smi_nodrm_ex > "$RESULTS_DIR/amd_smi_nodrm_ex.log" 2>&1 || echo 'amd_smi_nodrm_ex failed'
          echo "Example tests done"

      - name: AMDSMI Test Results
        if: always()
        run: |
          echo "Displaying AMDSMI test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}"

      - name: Integration Test Results
        if: always()
        run: |
          echo "Displaying Integration test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}"

      - name: Unit Test Results
        if: always()
        run: |
          echo "Displaying Unit Test Results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}"

      - name: Example DRM Test Results
        if: always()
        run: |
          echo "Displaying Example DRM test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}"

      - name: Example NoDRM Test Results
        if: always()
        run: |
          echo "Displaying Example NoDRM test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}"

  # ---------------------------------------------------------------------------
  # RPM-family: build, package, install and verify the .rpm.
  # ---------------------------------------------------------------------------
  rpm-buildinstall:
    name: Build
    runs-on:
      - self-hosted
      - ${{ vars.RUNNER_TYPE }}
    continue-on-error: true
    strategy:
      max-parallel: 10
      matrix:
        os:
          - SLES
          - RHEL8
          - RHEL9
          - RHEL10
          - AzureLinux3
          - AlmaLinux8
    container:
      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
      options: >-
        --rm --privileged --device=/dev/kfd --device=/dev/dri
        --group-add video --cap-add=SYS_PTRACE
        --security-opt seccomp=unconfined --shm-size=64G
        --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
    steps:
      - uses: actions/checkout@v4

      - name: Set Artifact Metadata
        if: github.event_name == 'pull_request'
        run: |
          # Set PR number and date for artifact naming
          echo "PR_NUMBER=PR${{ github.event.pull_request.number }}" >> "$GITHUB_ENV"
          # Set date in MMDDYY-HHMM format (UTC time)
          echo "BUILD_DATE=$(date -u +%m%d%y-%H%M)" >> "$GITHUB_ENV"

      - name: Set Project Directory
        run: |
          # head -n 1 keeps PROJECT_DIR a single line even if several paths match.
          TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
          if [ -z "$TARGET_DIR" ]; then
            TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
          fi
          # An empty PROJECT_DIR would make later steps operate on /build.
          if [ -z "$TARGET_DIR" ]; then
            echo "::error::Could not locate a CMakeLists.txt under $GITHUB_WORKSPACE"
            exit 1
          fi
          echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"

      - name: Set PkgMgr
        run: |
          set -e
          case "${{ matrix.os }}" in
            SLES)
              echo "PACKAGE_MANAGER=zypper" >> "$GITHUB_ENV"
              ;;
            RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
              echo "PACKAGE_MANAGER=dnf" >> "$GITHUB_ENV"
              ;;
          esac

      - name: Add more_itertools
        if: matrix.os == 'AzureLinux3'
        run: |
          set -e
          echo 'Installing more_itertools on ${{ matrix.os }}'
          python3 -m pip install more_itertools

      - name: Build AMDSMI(RHEL10 & AlmaLinux8)
        if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
        run: |
          set -e
          echo 'Building on ${{ matrix.os }} with retries and QA_RPATHS'
          BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
          RETRIES=5
          # Set QA_RPATHS to ignore empty (0x0010) and invalid (0x0002) RPATHs
          export QA_RPATHS=$((0x0010 | 0x0002))
          for i in $(seq 1 $RETRIES); do
            echo "Build attempt $i for ${{ matrix.os }} ..."
            rm -rf "$BUILD_FOLDER"
            mkdir -p "$BUILD_FOLDER"
            cd "$BUILD_FOLDER"
            if cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
               make -j $(nproc) && \
               make package; then
              echo "Build successful on attempt $i"
              break
            else
              echo "Build failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES build attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done
          echo "Build completed on ${{ matrix.os }}"

      - name: Upload RPM Package Artifacts (RHEL10 & AlmaLinux8)
        if: github.event_name == 'pull_request' && (matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8')
        uses: actions/upload-artifact@v4
        with:
          name: amd-smi-lib-rpm-${{ matrix.os }}-${{ env.PR_NUMBER }}-${{ env.BUILD_DATE }}
          path: ${{ env.PROJECT_DIR }}/build/amd-smi-lib-*99999-local*.rpm
          if-no-files-found: warn
          retention-days: 7

      - name: Build AMDSMI
        if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
        # Explicit bash + pipefail: the build commands are piped through tee,
        # and without pipefail the pipeline status would be tee's (always 0),
        # hiding cmake/make failures and disabling the retry logic.
        shell: bash
        run: |
          set -eo pipefail
          echo 'Building on ${{ matrix.os }}'
          BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
          RETRIES=3
          for i in $(seq 1 $RETRIES); do
            echo "Build attempt $i for ${{ matrix.os }}..."
            rm -rf "$BUILD_FOLDER"
            mkdir -p "$BUILD_FOLDER"
            cd "$BUILD_FOLDER"
            # Capture build output to parse warnings
            if cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \
               make -j $(nproc) 2>&1 | tee make.log && \
               make package 2>&1 | tee package.log; then
              # Parse and report warnings as GitHub annotations
              echo "::group::Build Warnings"
              # grep exits 1 when there are no warnings; that must not fail the step.
              { grep -i "warning" cmake.log make.log package.log || true; } | while read -r line; do
                echo "::warning::$line"
              done
              echo "::endgroup::"
              echo "Build successful on attempt $i"
              break
            else
              echo "Build failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES build attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done
          echo "Build completed on ${{ matrix.os }}"

      - name: Upload RPM Package Artifacts
        if: github.event_name == 'pull_request' && matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
        uses: actions/upload-artifact@v4
        with:
          name: amd-smi-lib-rpm-${{ matrix.os }}-${{ env.PR_NUMBER }}-${{ env.BUILD_DATE }}
          path: ${{ env.PROJECT_DIR }}/build/amd-smi-lib-*99999-local*.rpm
          if-no-files-found: warn
          retention-days: 7

      - name: Install AMDSMI(RHEL10 & AlmaLinux8)
        if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
        run: |
          cd "${{ env.PROJECT_DIR }}/build"
          dnf install python3-setuptools python3-wheel -y
          RETRIES=3
          for i in $(seq 1 $RETRIES); do
            echo "${{ matrix.os }}: Installation attempt $i..."
            # '*' is quoted so the shell never glob-expands the repo pattern.
            if timeout 10m dnf install -y --skip-broken --disablerepo='*' ./amd-smi-lib-*99999-local*.rpm; then
              echo "Installation successful on attempt $i"
              # -f keeps the step idempotent if the link already exists.
              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
              echo 'Verifying installation:'
              amd-smi version
              python3 -m pip list | grep amd
              python3 -m pip list | grep pip
              python3 -m pip list | grep setuptools
              echo 'Completed installation on ${{ matrix.os }}'
              break
            else
              echo "Installation failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES installation attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done

      - name: Install AMDSMI
        if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
        run: |
          cd "${{ env.PROJECT_DIR }}/build"
          case "${{ env.PACKAGE_MANAGER }}" in
            zypper)
              timeout 10m zypper --no-refresh --no-gpg-checks install -y ./amd-smi-lib-*99999-local*.rpm
              ;;
            dnf)
              dnf install python3-setuptools python3-wheel -y
              RETRIES=3
              for i in $(seq 1 $RETRIES); do
                echo "Attempt $i: Installing AMDSMI package..."
                # '*' is quoted so the shell never glob-expands the repo pattern.
                if timeout 10m dnf install -y --skip-broken --disablerepo='*' ./amd-smi-lib-*99999-local*.rpm; then
                  echo "AMDSMI package installed successfully."
                  break
                else
                  echo "Installation failed on attempt $i. Retrying..."
                  if [ $i -eq $RETRIES ]; then
                    echo "All $RETRIES attempts failed. Exiting."
                    exit 1
                  fi
                  sleep 10
                fi
              done
              ;;
          esac
          # -f keeps the step idempotent if the link already exists.
          ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
          # Verify Installation
          echo 'Verifying installation:'
          amd-smi version
          python3 -m pip list | grep amd
          python3 -m pip list | grep pip
          python3 -m pip list | grep setuptools
          echo 'Completed installation on ${{ matrix.os }}'

      - name: Uninstall
        if: always()
        run: |
          set -e
          echo 'Uninstalling on ${{ matrix.os }}'
          # Best effort: the package may not be installed if an earlier step failed.
          case "${{ matrix.os }}" in
            SLES)
              zypper remove -y amd-smi-lib || true
              ;;
            RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
              dnf remove -y amd-smi-lib || true
              ;;
          esac
          rm -f /usr/local/bin/amd-smi
          if [ -d /opt/rocm/share/amd_smi ]; then
            echo '/opt/rocm/share/amd_smi exists. Removing.'
            rm -rf /opt/rocm/share/amd_smi
          fi
          echo 'Uninstall done on ${{ matrix.os }}'

  # ---------------------------------------------------------------------------
  # RPM-family: rebuild, install and run the test suites.
  # ---------------------------------------------------------------------------
  rpm-test:
    name: Tests
    needs: [rpm-buildinstall, debian-test]
    runs-on:
      - self-hosted
      - ${{ vars.RUNNER_TYPE }}
    continue-on-error: true
    strategy:
      max-parallel: 10
      matrix:
        os:
          - SLES
          - RHEL8
          - RHEL9
          - RHEL10
          - AzureLinux3
          - AlmaLinux8
    container:
      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
      options: >-
        --rm --privileged --device=/dev/kfd --device=/dev/dri
        --group-add video --cap-add=SYS_PTRACE
        --security-opt seccomp=unconfined --shm-size=64G
        --cap-add=SYS_MODULE -v /lib/modules:/lib/modules -u root
    steps:
      - uses: actions/checkout@v4

      - name: Set Project Directory
        run: |
          # head -n 1 keeps PROJECT_DIR a single line even if several paths match.
          TARGET_DIR=$(find "$GITHUB_WORKSPACE" -path "*/projects/amdsmi/CMakeLists.txt" -exec dirname {} \; | head -n 1)
          if [ -z "$TARGET_DIR" ]; then
            TARGET_DIR=$(find "$GITHUB_WORKSPACE" -maxdepth 2 -name "CMakeLists.txt" -exec dirname {} \; | head -n 1)
          fi
          # An empty PROJECT_DIR would make later steps operate on /build.
          if [ -z "$TARGET_DIR" ]; then
            echo "::error::Could not locate a CMakeLists.txt under $GITHUB_WORKSPACE"
            exit 1
          fi
          echo "PROJECT_DIR=$TARGET_DIR" >> "$GITHUB_ENV"

      - name: Set PkgMgr
        run: |
          set -e
          case "${{ matrix.os }}" in
            SLES)
              echo "PACKAGE_MANAGER=zypper" >> "$GITHUB_ENV"
              ;;
            RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3)
              echo "PACKAGE_MANAGER=dnf" >> "$GITHUB_ENV"
              ;;
          esac

      - name: Add more_itertools
        if: matrix.os == 'AzureLinux3'
        run: |
          set -e
          echo 'Installing more_itertools on ${{ matrix.os }}'
          python3 -m pip install more_itertools

      - name: Build and Install for Tests (RHEL10 & AlmaLinux8)
        if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8'
        run: |
          set -e
          echo 'Building for test on ${{ matrix.os }} with retries and QA_RPATHS'
          BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
          RETRIES=5
          # Set QA_RPATHS to ignore empty (0x0010) and invalid (0x0002) RPATHs
          export QA_RPATHS=$((0x0010 | 0x0002))
          for i in $(seq 1 $RETRIES); do
            echo "Build attempt $i for ${{ matrix.os }} test..."
            rm -rf "$BUILD_FOLDER"
            mkdir -p "$BUILD_FOLDER"
            cd "$BUILD_FOLDER"
            if cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \
               make -j $(nproc) && \
               make package; then
              echo "Build successful on attempt $i"
              break
            else
              echo "Build failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES build attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done
          echo 'Installing for test on ${{ matrix.os }}'
          dnf install python3-setuptools python3-wheel -y
          for i in $(seq 1 $RETRIES); do
            echo "${{ matrix.os }}: Installation attempt $i for test..."
            # '*' is quoted so the shell never glob-expands the repo pattern.
            if timeout 10m dnf install -y --skip-broken --disablerepo='*' "$BUILD_FOLDER"/amd-smi-lib-*99999-local*.rpm; then
              echo "Installation successful on attempt $i"
              # -f keeps the step idempotent if the link already exists.
              ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
              echo 'Install done for test on ${{ matrix.os }}'
              break
            else
              echo "Installation failed on attempt $i"
              if [ $i -eq $RETRIES ]; then
                echo "All $RETRIES installation attempts failed. Exiting."
                exit 1
              fi
              sleep $((2 * i))
            fi
          done

      - name: Build and Install for Tests
        if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8'
        run: |
          set -e
          echo 'Building for test on ${{ matrix.os }}'
          BUILD_FOLDER="${{ env.PROJECT_DIR }}/build"
          rm -rf "$BUILD_FOLDER"
          mkdir -p "$BUILD_FOLDER"
          cd "$BUILD_FOLDER"
          cmake "${{ env.PROJECT_DIR }}" -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON
          make -j $(nproc)
          make package
          echo 'Installing for test on ${{ matrix.os }}'
          case "${{ env.PACKAGE_MANAGER }}" in
            zypper)
              timeout 10m zypper --no-refresh --no-gpg-checks install -y "$BUILD_FOLDER"/amd-smi-lib-*99999-local*.rpm
              ;;
            dnf)
              dnf install python3-setuptools python3-wheel -y
              RETRIES=3
              for i in $(seq 1 $RETRIES); do
                echo "Attempt $i: Installing..."
                # '*' is quoted so the shell never glob-expands the repo pattern.
                if timeout 10m dnf install -y --skip-broken --disablerepo='*' "$BUILD_FOLDER"/amd-smi-lib-*99999-local*.rpm; then
                  echo "Install successful."
                  break
                else
                  echo "Attempt $i failed. Retrying..."
                  if [ $i -eq $RETRIES ]; then
                    echo "All attempts failed."
                    exit 1
                  fi
                  sleep 10
                fi
              done
              ;;
          esac
          # -f keeps the step idempotent if the link already exists.
          ln -sf /opt/rocm/bin/amd-smi /usr/local/bin
          echo 'Install done for test on ${{ matrix.os }}'

      - name: AMDSMI Command Tests
        shell: bash
        run: |
          set -e
          echo "Running AMDSMI commands on ${{ matrix.os }}"
          RESULTS_DIR="/tmp/test-results-${{ matrix.os }}"
          mkdir -p "$RESULTS_DIR"
          commands=(
            "amd-smi version"
            "amd-smi list"
            "amd-smi static"
            "amd-smi firmware"
            "amd-smi ucode"
            "amd-smi bad-pages"
            "amd-smi metric"
            "amd-smi process"
            "amd-smi topology"
            "amd-smi monitor"
            "amd-smi dmon"
            "amd-smi xgmi"
            "amd-smi partition"
          )
          for cmd in "${commands[@]}"; do
            debug_cmd="$cmd --loglevel debug"
            # One log per command, named after the command with spaces as underscores.
            log_file="$RESULTS_DIR/${cmd// /_}.log"
            echo "Running: $debug_cmd"
            if ! eval "$debug_cmd" > "$log_file" 2>&1; then
              echo "Command '$debug_cmd' failed."
              cat "$log_file"
              exit 1
            else
              echo "$debug_cmd passed."
            fi
          done
          echo "AMDSMI commands done on ${{ matrix.os }}"

      - name: Upload AMDSMI Command Test Results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: amdsmi-command-tests-${{ matrix.os }}
          path: /tmp/test-results-${{ matrix.os }}

      - name: Run AMDSMI, Python, and Example Tests
        shell: bash
        run: |
          set -e
          echo 'Running other tests on ${{ matrix.os }}'
          RESULTS_DIR="/tmp/test-results-${{ matrix.os }}"
          SUMMARY_PATTERN="\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]"

          # AMDSMI Tests
          echo 'Running AMDSMI tests'
          cd /opt/rocm/share/amd_smi/tests
          # amdsmitst.exclude is sourced for BLACKLIST_ALL_ASICS, used in the filter below.
          source amdsmitst.exclude
          AMDSMI_RETRIES=3
          for attempt in $(seq 1 $AMDSMI_RETRIES); do
            echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..."
            if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > "$RESULTS_DIR/amdsmi_tests.log" 2>&1; then
              echo "AMDSMI tests passed on attempt $attempt"
              echo "=============== TEST OUTPUT ==============="
              # "|| true": a log without summary lines must not fail a passing run.
              grep -E "$SUMMARY_PATTERN" "$RESULTS_DIR/amdsmi_tests.log" || true
              echo "=============================================="
              echo "AMDSMI tests done"
              break
            else
              TEST_EXIT_CODE=$?
              echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE"
              if [ $attempt -eq $AMDSMI_RETRIES ]; then
                echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure."
                echo "=============== TEST OUTPUT ==============="
                # "|| true": keep going so the real test exit code is reported below.
                grep -E "$SUMMARY_PATTERN" "$RESULTS_DIR/amdsmi_tests.log" || true
                echo "=============================================="
                echo "AMDSMI tests failed"
                exit $TEST_EXIT_CODE
              else
                echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..."
                sleep $((2 * attempt))
              fi
            fi
          done

          # Python Tests
          echo 'Running Python tests'
          cd /opt/rocm/share/amd_smi/tests/python_unittest
          echo "Running integration tests..."
          if ! ./integration_test.py -v > "$RESULTS_DIR/integration_test_output.txt" 2>&1; then
            echo "Integration tests failed!"
            echo "=============== INTEGRATION TEST OUTPUT ==============="
            tail -100 "$RESULTS_DIR/integration_test_output.txt"
            echo "======================================================="
            exit 1
          else
            echo "Integration tests passed"
          fi
          echo "Running unit tests..."
          if ! ./unit_tests.py -v > "$RESULTS_DIR/unit_test_output.txt" 2>&1; then
            echo "Unit tests failed!"
            echo "=============== UNIT TEST OUTPUT ==============="
            tail -100 "$RESULTS_DIR/unit_test_output.txt"
            echo "================================================"
            exit 1
          else
            echo "Unit tests passed"
          fi
          echo "Python tests done"

          # Example Tests
          echo 'Running Example tests'
          cd "${{ env.PROJECT_DIR }}/example"
          rm -rf build
          cmake -B build -DENABLE_ESMI_LIB=OFF
          make -C build -j $(nproc)
          cd build
          # Example binaries are informational: a failure is logged but does not fail the step.
          ./amd_smi_drm_ex > "$RESULTS_DIR/amd_smi_drm_ex.log" 2>&1 || echo 'amd_smi_drm_ex failed'
          ./amd_smi_nodrm_ex > "$RESULTS_DIR/amd_smi_nodrm_ex.log" 2>&1 || echo 'amd_smi_nodrm_ex failed'
          echo "Example tests done"

      - name: AMDSMI Test Results
        if: always()
        run: |
          echo "Displaying AMDSMI test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}"

      - name: Integration Test Results
        if: always()
        run: |
          echo "Displaying Integration test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}"

      - name: Unit Test Results
        if: always()
        run: |
          echo "Displaying Unit Test Results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}"

      - name: Example DRM Test Results
        if: always()
        run: |
          echo "Displaying Example DRM test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}"

      - name: Example NoDRM Test Results
        if: always()
        run: |
          echo "Displaying Example NoDRM test results for ${{ matrix.os }}"
          cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}"