[rocprofiler-systems] Add retries to RHEL install steps (#1384)

* Add GHCR retry logic

* Add retries to Install ROCm Packages step in rocprofiler-systems-redhat.yml

* Update containers-ci.yml file to use latest RHEL9/10 releases

* Use build-docker-ci script in rocprofiler-systems-containers

* Remove working-directory from step in rocprofiler-systems-redhat.yml

* Remove "shell: bash" from the Install ROCm Packages step

* Revert RHEL version change in rocprofiler-systems-redhat.yml
This commit is contained in:
Jason Bonnell
2025-10-17 10:20:54 -04:00
committed by GitHub
parent b9b8b6110b
commit 9664f1dc91
4 changed files with 78 additions and 53 deletions
@@ -69,40 +69,45 @@ jobs:
with:
username: ${{ secrets.ROCPROF_SYS_DOCKER_LOGIN }}
password: ${{ secrets.ROCPROF_SYS_DOCKER_TOKEN }}
# Derive the container image name and Dockerfile name for the current matrix
# distro and publish them as step outputs (`distro_image`, `docker_file`).
- name: Set up Docker variables
  id: setup_vars
  env:
    # Pass the matrix value through the environment instead of expanding it
    # inline in the script, so the shell never re-parses it and an empty
    # value cannot break the comparisons below.
    DISTRO: ${{ matrix.distro }}
  run: |
    # Map the matrix distro name to the image name it is built from.
    case "${DISTRO}" in
      opensuse) DISTRO_IMAGE="opensuse/leap" ;;
      rhel)     DISTRO_IMAGE="rockylinux/rockylinux" ;;
      *)        DISTRO_IMAGE="${DISTRO}" ;;
    esac
    echo "distro_image=${DISTRO_IMAGE}" >> "${GITHUB_OUTPUT}"
    # Debian uses the Ubuntu Dockerfile; every other distro uses its own.
    if [ "${DISTRO}" = "debian" ]; then
      DOCKER_FILE="Dockerfile.ubuntu.ci"
    else
      DOCKER_FILE="Dockerfile.${DISTRO}.ci"
    fi
    echo "docker_file=${DOCKER_FILE}" >> "${GITHUB_OUTPUT}"
# NOTE(review): this range lists two step headers back to back (a
# docker/build-push-action@v6 step and a nick-fields/retry@v3 step) and a
# single `with:` carrying inputs for both actions. That reads like a diff
# view with removed and added lines shown together, without +/- markers.
# TODO confirm against the actual workflow file.
- name: Build CI Base Container (Does not Push on PR)
uses: docker/build-push-action@v6
# Pull-request build: runs build-docker-ci.sh through nick-fields/retry and
# does not pass --push.
- name: Build CI Container (PR - No Push)
if: github.event_name == 'pull_request'
# NOTE(review): this step-level limit (45) equals the retry action's
# `timeout_minutes` (45) below, while `max_attempts` is 3. If the step-level
# limit covers all attempts combined, a slow first attempt leaves no time for
# a retry. Confirm the intended time budget.
timeout-minutes: 45
uses: nick-fields/retry@v3
with:
file: projects/rocprofiler-systems/docker/${{ steps.setup_vars.outputs.docker_file }}
platforms: linux/amd64
push: ${{ github.event_name != 'pull_request' }}
build-args: |
DISTRO=${{ steps.setup_vars.outputs.distro_image }}
VERSION=${{ matrix.version }}
NJOBS=2
ELFUTILS_DOWNLOAD_VERSION=0.188
BOOST_DOWNLOAD_VERSION=1.79.0
tags: |
${{ secrets.ROCPROF_SYS_DOCKER_LOGIN }}/rocprofiler-systems:ci-base-${{ matrix.distro }}-${{ matrix.version }}
retry_wait_seconds: 60
timeout_minutes: 45
max_attempts: 3
command: |
pushd projects/rocprofiler-systems/docker
./build-docker-ci.sh \
--distro ${{ matrix.distro }} \
--versions ${{ matrix.version }} \
--user ${{ secrets.ROCPROF_SYS_DOCKER_LOGIN }} \
--jobs 2 \
--elfutils-version 0.188 \
--boost-version 1.79.0
popd
# Non-PR build: run build-docker-ci.sh with --push through nick-fields/retry,
# allowing up to three attempts.
- name: Build Base Container (Push)
  if: github.event_name != 'pull_request'
  # Step-level ceiling for ALL attempts combined: 3 attempts x 45 min plus
  # 2 waits x 60 s = 137 min, rounded up. The previous value (45) matched the
  # per-attempt limit, so a slow or late-failing first attempt left no time
  # for a retry before the step itself was cancelled.
  timeout-minutes: 140
  uses: nick-fields/retry@v3
  with:
    max_attempts: 3
    # Per-attempt limit enforced by the retry action.
    timeout_minutes: 45
    retry_wait_seconds: 60
    command: |
      pushd projects/rocprofiler-systems/docker
      ./build-docker-ci.sh \
        --distro "${{ matrix.distro }}" \
        --versions "${{ matrix.version }}" \
        --user "${{ secrets.ROCPROF_SYS_DOCKER_LOGIN }}" \
        --jobs 2 \
        --elfutils-version 0.188 \
        --boost-version 1.79.0 \
        --push
      popd
prepare_matrix_release:
if: github.repository == 'ROCm/rocm-systems'
@@ -103,6 +103,29 @@ jobs:
images: ghcr.io/ROCm/rocprofiler-${{ matrix.system.distro }}
# First attempt at building the GFX CI image with docker/build-push-action.
# `continue-on-error` lets the job proceed if this attempt fails; the step is
# addressable through the `docker_build` id.
- id: docker_build
  name: Build CI GFX Container (Does not Push on PR)
  uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83  # v6.18.0
  continue-on-error: true
  with:
    platforms: linux/amd64
    file: projects/rocprofiler-systems/docker/${{ steps.setup_vars_gfx.outputs.docker_file }}
    # Push for every event except pull_request.
    push: ${{ github.event_name != 'pull_request' }}
    labels: ${{ steps.meta.outputs.labels }}
    tags: |
      ghcr.io/rocm/rocprofiler-${{ matrix.system.distro }}:${{ matrix.system.version }}-systems-ci-${{ matrix.gpu }}
    build-args: |
      DISTRO=${{ steps.setup_vars_gfx.outputs.distro_image }}
      VERSION=${{ matrix.system.version }}
      TYPE=${{ matrix.gpu }}
      GPU_TYPE=${{ matrix.gpu }}
      GPU_TARBALL=${{ steps.therock.outputs.tarball }}
      NJOBS=2
      ELFUTILS_DOWNLOAD_VERSION=0.188
      BOOST_DOWNLOAD_VERSION=1.79.0
# Re-run the same build as docker_build if that first attempt did not succeed (e.g. due to an intermittent failure)
- name: Build CI GFX Container Retry (Does not Push on PR)
if: steps.docker_build.outcome != 'success'
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
with:
file: projects/rocprofiler-systems/docker/${{ steps.setup_vars_gfx.outputs.docker_file }}
@@ -78,19 +78,22 @@ jobs:
# Installs ROCm packages when matrix.rocm-version > 0. The script splits the
# ROCm version into major/minor, computes ROCM_VERSN = major*10000 + minor*100
# for the amdgpu-install RPM file name, installs perl-File-BaseDir from the
# powertools (OS major 8) or crb repo, installs the amdgpu-install RPM from
# repo.radeon.com, then rocm-dev and rocdecode-devel, plus
# libavcodec-free-devel and libavformat-free-devel when OS_VERSION_MAJOR > 8.
#
# NOTE(review): the same script appears twice in this range, once under
# `shell: bash` / `working-directory` / `run:` and once as the `command:`
# input of nick-fields/retry@v3. That reads like a diff view with removed and
# added lines shown together. TODO confirm against the actual workflow file.
- name: Install ROCm Packages
if: ${{ matrix.rocm-version > 0 }}
# NOTE(review): this step-level limit (30) equals the retry action's
# `timeout_minutes` (30) below, while `max_attempts` is 3. If the step-level
# limit covers all attempts combined, a slow first attempt leaves no time for
# a retry. Confirm the intended time budget.
timeout-minutes: 30
shell: bash
working-directory: projects/rocprofiler-systems/
run: |
RPM_TAG=".el${OS_VERSION_MAJOR}"
ROCM_VERSION=${{ matrix.rocm-version }}
ROCM_MAJOR=$(echo ${ROCM_VERSION} | sed 's/\./ /g' | awk '{print $1}')
ROCM_MINOR=$(echo ${ROCM_VERSION} | sed 's/\./ /g' | awk '{print $2}')
ROCM_VERSN=$(( (${ROCM_MAJOR}*10000)+(${ROCM_MINOR}*100) ))
if [ "${OS_VERSION_MAJOR}" -eq 8 ]; then PERL_REPO=powertools; else PERL_REPO=crb; fi
dnf -y --enablerepo=${PERL_REPO} install perl-File-BaseDir
yum install -y https://repo.radeon.com/amdgpu-install/${{ matrix.rocm-version }}/rhel/${{ matrix.os-release }}/amdgpu-install-${ROCM_MAJOR}.${ROCM_MINOR}.${ROCM_VERSN}-1${RPM_TAG}.noarch.rpm
yum install -y rocm-dev rocdecode-devel
if [ "${OS_VERSION_MAJOR}" -gt 8 ]; then dnf install -y libavcodec-free-devel libavformat-free-devel; fi
# Retry wrapper inputs: up to 3 attempts, 30 s wait between attempts,
# 30 min limit on the retry action's side.
uses: nick-fields/retry@v3
with:
retry_wait_seconds: 30
timeout_minutes: 30
max_attempts: 3
command: |
RPM_TAG=".el${OS_VERSION_MAJOR}"
ROCM_VERSION=${{ matrix.rocm-version }}
ROCM_MAJOR=$(echo ${ROCM_VERSION} | sed 's/\./ /g' | awk '{print $1}')
ROCM_MINOR=$(echo ${ROCM_VERSION} | sed 's/\./ /g' | awk '{print $2}')
ROCM_VERSN=$(( (${ROCM_MAJOR}*10000)+(${ROCM_MINOR}*100) ))
if [ "${OS_VERSION_MAJOR}" -eq 8 ]; then PERL_REPO=powertools; else PERL_REPO=crb; fi
dnf -y --enablerepo=${PERL_REPO} install perl-File-BaseDir
yum install -y https://repo.radeon.com/amdgpu-install/${{ matrix.rocm-version }}/rhel/${{ matrix.os-release }}/amdgpu-install-${ROCM_MAJOR}.${ROCM_MINOR}.${ROCM_VERSN}-1${RPM_TAG}.noarch.rpm
yum install -y rocm-dev rocdecode-devel
if [ "${OS_VERSION_MAJOR}" -gt 8 ]; then dnf install -y libavcodec-free-devel libavformat-free-devel; fi
- name: Configure, Build, and Test
timeout-minutes: 115
@@ -19,13 +19,7 @@ matrix:
version: "8.10"
# NOTE(review): in the entries below, the last RHEL9 item is followed by two
# `version:` values ("9.6" then "9") and the RHEL10 item by two ("10.0" then
# "10"). Read as literal YAML that is a duplicate key, where most parsers keep
# the last value. Presumably these are removed/added lines of a diff shown
# together. TODO confirm against the actual matrix file.
# RHEL9
- distro: "rhel"
version: "9.3"
- distro: "rhel"
version: "9.4"
- distro: "rhel"
version: "9.5"
- distro: "rhel"
version: "9.6"
version: "9"
# RHEL10
- distro: "rhel"
version: "10.0"
version: "10"