[TheRock CI] Adding TheRock RCCL tests (#1873)

* First commit for rccl multi node test workflow

* Adding workflow dispatch

* Added branch based pull trigger

* Changed typo in branch name

* Add input variables to push

* Removed input variables to push

* Added self hosted runner for Vultr cloud

* Skipping build and only running test

* Changed test runner label name

* Made changes to executable paths in test script

* Made changes to run

* Made changes to cd into cvs dir

* This is a dummy commit

* Added cmake options

* Modified build options

* Commiting build changes

* Adding rccl and rccl-tests

* Re-ordering rccl and rccl-tests

* adding --global command

* modified cmake command

* modified script paths

* Testing OIDC for rccl repo

* Testing OIDC for rccl repo

* Testing build and upload workflow

* use default env variable for AMDGPU families on push workflow trigger

* Adding cleanup and correct role

* Adding additional yml files

* Fixing typo';

* Adding new sha

* Adding correct gpu target

* Adding back venv bin activate

* Adding workflow dispatch for tests

* Testing

* Adding cat

* Adding cat

* Adding rocm dir change

* Adding checkout

* cat with sudo

* rccl checkout

* correcting branch

* removing sudo

* trying to adjust correct path'

* Adding output dir path

* Use docker container with pre-installed MPI

* Adding back build steps

* Fixing SHA

* Adding exclusion logic:

* Adding test

* Adding CI check

* Removing testing

* Limit to build only rccl, rccl-tests and required dependencies

* Adding test

* Removing test

* Removing quote

* Reverting test

* PR comments

---------

Co-authored-by: arravikum <arravikum@amd.com>
Co-authored-by: Marius Brehler <marius.brehler@amd.com>

[ROCm/rccl commit: f9a957bbab]
This commit is contained in:
Geo Min
2025-08-20 15:07:23 -07:00
committed by GitHub
parent 3a544ed5f4
commit bec7d58b04
4 changed files with 405 additions and 0 deletions
+131
View File
@@ -0,0 +1,131 @@
import fnmatch
import json
import os
from pathlib import Path
import subprocess
import sys
from typing import Iterable, Optional, Mapping
def gha_set_output(vars: Mapping[str, str | Path]):
"""Sets values in a step's output parameters.
This appends to the file located at the $GITHUB_OUTPUT environment variable.
See
* https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
* https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
"""
print(f"Setting github output:\n{vars}")
step_output_file = os.getenv("GITHUB_OUTPUT")
if not step_output_file:
print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
return
with open(step_output_file, "a") as f:
f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())
def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
"""Returns the paths of modified files relative to the base reference."""
try:
return subprocess.run(
["git", "diff", "--name-only", base_ref],
stdout=subprocess.PIPE,
check=True,
text=True,
timeout=60,
).stdout.splitlines()
except TimeoutError:
print(
"Computing modified files timed out. Not using PR diff to determine"
" jobs to run.",
file=sys.stderr,
)
return None
GITHUB_WORKFLOWS_CI_PATTERNS = [
"therock*.yml",
]
def is_path_workflow_file_related_to_ci(path: str) -> bool:
return any(
fnmatch.fnmatch(path, ".github/workflows/" + pattern)
for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
)
def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
if paths is None:
return False
return any(is_path_workflow_file_related_to_ci(p) for p in paths)
# Paths matching any of these patterns are considered to have no influence over
# build or test workflows so any related jobs can be skipped if all paths
# modified by a commit/PR match a pattern in this list.
SKIPPABLE_PATH_PATTERNS = [
"docs/*",
"*.gitignore",
"*.md",
"*LICENSE*",
"*NOTICES*",
'.github/CODEOWNERS',
'.github/*.md',
'.github/dependabot.yml',
'.azuredevops*',
]
def is_path_skippable(path: str) -> bool:
"""Determines if a given relative path to a file matches any skippable patterns."""
return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS)
def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
"""Returns true if at least one path is not in the skippable set."""
if paths is None:
return False
return any(not is_path_skippable(p) for p in paths)
def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
"""Returns true if CI workflows should run given a list of modified paths."""
if paths is None:
print("No files were modified, skipping TheRock CI jobs")
return False
paths_set = set(paths)
github_workflows_paths = set(
[p for p in paths if p.startswith(".github/workflows")]
)
other_paths = paths_set - github_workflows_paths
related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
print("should_ci_run_given_modified_paths findings:")
print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")
if related_to_ci:
print("Enabling build jobs since a related workflow file was modified")
return True
elif contains_other_non_skippable_files:
print("Enabling TheRock CI jobs since a non-skippable path was modified")
return True
else:
print(
"Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
)
return False
def main(args):
base_ref = args.get("base_ref")
modified_paths = get_modified_paths(base_ref)
print("modified_paths (max 200):", modified_paths[:200])
enable_jobs = should_ci_run_given_modified_paths(modified_paths)
output = {
'enable_therock_ci': json.dumps(enable_jobs)
}
gha_set_output(output)
if __name__ == "__main__":
args = {}
args["base_ref"] = os.environ.get("BASE_REF", "HEAD^1")
main(args)
+135
View File
@@ -0,0 +1,135 @@
name: TheRock CI Linux
on:
workflow_call:
inputs:
amdgpu_families:
type: string
extra_cmake_options:
type: string
permissions:
contents: read
jobs:
therock-build-linux:
name: Build Linux Packages
runs-on: azure-linux-scale-rocm
permissions:
id-token: write
container:
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:283673fe3e1bf498d079e3f386b794af1b4f71845a9a0107c6cf7aa304dce050
options: -v /runner/config:/home/awsconfig/
env:
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
TEATIME_FORCE_INTERACTIVE: 0
AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
steps:
- name: Checkout TheRock repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: 5f35280878ca80755c5456dfdb527c29d3f5058c
- name: Checkout rccl repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/rccl"
path: rccl
- name: Checkout rccl-tests repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/rccl-tests"
path: rccl-tests
- name: Runner Health Settings
run: |
df -h
echo cmake --version
echo "Installed Python versions:"
ls -d /opt/python
echo "python: $(which python), python3: $(which python3)"
echo "Git version: $(git --version)"
git config --global --add safe.directory $PWD
git config fetch.parallel 10
- name: Fetch sources
run: |
./build_tools/fetch_sources.py --jobs 12
- name: Install python deps
run: |
pip install -r requirements.txt
pip freeze
- name: Configure Projects
env:
amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
package_version: ADHOCBUILD
extra_cmake_options: ${{ inputs.extra_cmake_options }}
BUILD_DIR: build
run: |
python3 build_tools/github_actions/build_configure.py
- name: Build therock-dist
run: cmake --build build
- name: Build therock-archives
run: cmake --build build --target therock-archives
- name: Report
#if: ${{ !cancelled() }}
run: |
echo "Full SDK du:"
echo "------------"
du -h -d 1 build/dist/rocm
echo "Artifact Archives:"
echo "------------------"
ls -lh build/artifacts/*.tar.xz
echo "Artifacts:"
echo "----------"
du -h -d 1 build/artifacts
- name: Configure AWS Credentials for non-forked repos
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
with:
aws-region: us-east-2
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
- name: Create Logs index Files and upload logs
if: always()
run: |
python3 build_tools/github_actions/create_log_index.py \
--build-dir=build \
--amdgpu-family=${{ env.AMDGPU_FAMILIES }}
python3 build_tools/github_actions/upload_build_logs_to_s3.py \
--build-dir=build \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }}
- name: Upload artifacts
run: |
python build_tools/github_actions/upload_build_artifacts.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir build
- name: Add Links to Job Summary
if: always()
run: |
python build_tools/github_actions/upload_build_summary.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir build
therock-test-linux:
name: "Test"
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
test_runs_on: vultr-linux-rocm
artifact_run_id: ${{ github.run_id }}
+81
View File
@@ -0,0 +1,81 @@
name: TheRock CI for rccl
on:
push:
branches:
- develop
workflow_dispatch:
permissions:
contents: read
concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
# queued and in-progress runs for the same PR (presubmit) or commit
# (postsubmit). The workflow name is prepended to avoid conflicts between
# different workflows.
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true
jobs:
setup:
runs-on: ubuntu-24.04
env:
# The commit being checked out is the merge commit for a PR. Its first
# parent will be the tip of the base branch.
BASE_REF: HEAD^
outputs:
enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
steps:
- name: "Checking out repository"
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
# We need the parent commit to do a diff
fetch-depth: 2
- name: "Configuring CI options"
id: configure
run: python .github/scripts/therock_configure_ci.py
therock-ci-linux:
name: TheRock CI Linux
needs: setup
if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
permissions:
contents: read
id-token: write
uses: ./.github/workflows/therock-ci-linux.yml
secrets: inherit
with:
amdgpu_families: "gfx94X-dcgpu"
extra_cmake_options: >
-DTHEROCK_ENABLE_ALL=OFF
-DTHEROCK_BUILD_TESTING=ON
-DTHEROCK_BUNDLE_SYSDEPS=ON
-DTHEROCK_ENABLE_COMM_LIBS=ON
-DTHEROCK_ENABLE_ROCPROFV3=ON
-DTHEROCK_USE_EXTERNAL_RCCL=ON
-DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
-DTHEROCK_RCCL_SOURCE_DIR=./rccl
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
-DTHEROCK_ENABLE_MPI=ON
therock_ci_summary:
name: TheRock CI Summary
if: always()
needs:
- setup
- therock-ci-linux
runs-on: ubuntu-24.04
steps:
- name: Output failed jobs
run: |
echo '${{ toJson(needs) }}'
FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
| jq --raw-output \
'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
)"
if [[ "${FAILED_JOBS}" != "" ]]; then
echo "The following jobs failed: ${FAILED_JOBS}"
exit 1
fi
@@ -0,0 +1,58 @@
name: TheRock Test Packages
on:
workflow_call:
inputs:
amdgpu_families:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
workflow_dispatch:
inputs:
amdgpu_families:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
permissions:
contents: read
jobs:
test_rccl:
name: 'Test'
runs-on: ${{ inputs.test_runs_on }}
defaults:
run:
shell: bash
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
THEROCK_BIN_DIR: "./build/bin"
steps:
- name: Checkout Repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: 890c856134d955441790c8ed2d60ad4fb027f4e5
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
with:
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
VENV_DIR: ${{ env.VENV_DIR }}
FETCH_ARTIFACT_ARGS: "--rccl"
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
- name: Run tests on vultr-linux-node
run: |
source /home/arravikum/TheRock/.venv/bin/activate
cd /home/arravikum/cvs
pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html