[TheRock CI] Adding TheRock RCCL tests (#1873)
* First commit for rccl multi node test workflow
* Adding workflow dispatch
* Added branch based pull trigger
* Changed typo in branch name
* Add input variables to push
* Removed input variables to push
* Added self hosted runner for Vultr cloud
* Skipping build and only running test
* Changed test runner label name
* Made changes to executable paths in test script
* Made changes to run
* Made changes to cd into cvs dir
* This is a dummy commit
* Added cmake options
* Modified build options
* Commiting build changes
* Adding rccl and rccl-tests
* Re-ordering rccl and rccl-tests
* adding --global command
* modified cmake command
* modified script paths
* Testing OIDC for rccl repo
* Testing OIDC for rccl repo
* Testing build and upload workflow
* use default env variable for AMDGPU families on push workflow trigger
* Adding cleanup and correct role
* Adding additional yml files
* Fixing typo';
* Adding new sha
* Adding correct gpu target
* Adding back venv bin activate
* Adding workflow dispatch for tests
* Testing
* Adding cat
* Adding cat
* Adding rocm dir change
* Adding checkout
* cat with sudo
* rccl checkout
* correcting branch
* removing sudo
* trying to adjust correct path'
* Adding output dir path
* Use docker container with pre-installed MPI
* Adding back build steps
* Fixing SHA
* Adding exclusion logic:
* Adding test
* Adding CI check
* Removing testing
* Limit to build only rccl, rccl-tests and required dependencies
* Adding test
* Removing test
* Removing quote
* Reverting test
* PR comments
---------
Co-authored-by: arravikum <arravikum@amd.com>
Co-authored-by: Marius Brehler <marius.brehler@amd.com>
[ROCm/rccl commit: f9a957bbab]
This commit is contained in:
@@ -0,0 +1,131 @@
|
||||
import fnmatch
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import Iterable, Optional, Mapping
|
||||
|
||||
def gha_set_output(vars: Mapping[str, str | Path]):
|
||||
"""Sets values in a step's output parameters.
|
||||
|
||||
This appends to the file located at the $GITHUB_OUTPUT environment variable.
|
||||
|
||||
See
|
||||
* https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
|
||||
* https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
|
||||
"""
|
||||
print(f"Setting github output:\n{vars}")
|
||||
|
||||
step_output_file = os.getenv("GITHUB_OUTPUT")
|
||||
if not step_output_file:
|
||||
print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
|
||||
return
|
||||
|
||||
with open(step_output_file, "a") as f:
|
||||
f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())
|
||||
|
||||
def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
|
||||
"""Returns the paths of modified files relative to the base reference."""
|
||||
try:
|
||||
return subprocess.run(
|
||||
["git", "diff", "--name-only", base_ref],
|
||||
stdout=subprocess.PIPE,
|
||||
check=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
).stdout.splitlines()
|
||||
except TimeoutError:
|
||||
print(
|
||||
"Computing modified files timed out. Not using PR diff to determine"
|
||||
" jobs to run.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return None
|
||||
|
||||
GITHUB_WORKFLOWS_CI_PATTERNS = [
|
||||
"therock*.yml",
|
||||
]
|
||||
|
||||
|
||||
def is_path_workflow_file_related_to_ci(path: str) -> bool:
|
||||
return any(
|
||||
fnmatch.fnmatch(path, ".github/workflows/" + pattern)
|
||||
for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
|
||||
)
|
||||
|
||||
def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
|
||||
if paths is None:
|
||||
return False
|
||||
return any(is_path_workflow_file_related_to_ci(p) for p in paths)
|
||||
|
||||
# Paths matching any of these patterns are considered to have no influence over
|
||||
# build or test workflows so any related jobs can be skipped if all paths
|
||||
# modified by a commit/PR match a pattern in this list.
|
||||
SKIPPABLE_PATH_PATTERNS = [
|
||||
"docs/*",
|
||||
"*.gitignore",
|
||||
"*.md",
|
||||
"*LICENSE*",
|
||||
"*NOTICES*",
|
||||
'.github/CODEOWNERS',
|
||||
'.github/*.md',
|
||||
'.github/dependabot.yml',
|
||||
'.azuredevops*',
|
||||
]
|
||||
|
||||
def is_path_skippable(path: str) -> bool:
|
||||
"""Determines if a given relative path to a file matches any skippable patterns."""
|
||||
return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS)
|
||||
|
||||
def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
|
||||
"""Returns true if at least one path is not in the skippable set."""
|
||||
if paths is None:
|
||||
return False
|
||||
return any(not is_path_skippable(p) for p in paths)
|
||||
|
||||
def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
|
||||
"""Returns true if CI workflows should run given a list of modified paths."""
|
||||
|
||||
if paths is None:
|
||||
print("No files were modified, skipping TheRock CI jobs")
|
||||
return False
|
||||
|
||||
paths_set = set(paths)
|
||||
github_workflows_paths = set(
|
||||
[p for p in paths if p.startswith(".github/workflows")]
|
||||
)
|
||||
other_paths = paths_set - github_workflows_paths
|
||||
|
||||
related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
|
||||
contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
|
||||
|
||||
print("should_ci_run_given_modified_paths findings:")
|
||||
print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")
|
||||
|
||||
if related_to_ci:
|
||||
print("Enabling build jobs since a related workflow file was modified")
|
||||
return True
|
||||
elif contains_other_non_skippable_files:
|
||||
print("Enabling TheRock CI jobs since a non-skippable path was modified")
|
||||
return True
|
||||
else:
|
||||
print(
|
||||
"Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
|
||||
)
|
||||
return False
|
||||
|
||||
def main(args):
|
||||
base_ref = args.get("base_ref")
|
||||
modified_paths = get_modified_paths(base_ref)
|
||||
print("modified_paths (max 200):", modified_paths[:200])
|
||||
enable_jobs = should_ci_run_given_modified_paths(modified_paths)
|
||||
output = {
|
||||
'enable_therock_ci': json.dumps(enable_jobs)
|
||||
}
|
||||
gha_set_output(output)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = {}
|
||||
args["base_ref"] = os.environ.get("BASE_REF", "HEAD^1")
|
||||
main(args)
|
||||
@@ -0,0 +1,135 @@
|
||||
name: TheRock CI Linux
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
amdgpu_families:
|
||||
type: string
|
||||
extra_cmake_options:
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
therock-build-linux:
|
||||
name: Build Linux Packages
|
||||
runs-on: azure-linux-scale-rocm
|
||||
permissions:
|
||||
id-token: write
|
||||
container:
|
||||
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:283673fe3e1bf498d079e3f386b794af1b4f71845a9a0107c6cf7aa304dce050
|
||||
options: -v /runner/config:/home/awsconfig/
|
||||
env:
|
||||
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
|
||||
TEATIME_FORCE_INTERACTIVE: 0
|
||||
AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
|
||||
steps:
|
||||
- name: Checkout TheRock repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: 5f35280878ca80755c5456dfdb527c29d3f5058c
|
||||
|
||||
- name: Checkout rccl repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/rccl"
|
||||
path: rccl
|
||||
|
||||
- name: Checkout rccl-tests repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/rccl-tests"
|
||||
path: rccl-tests
|
||||
|
||||
- name: Runner Health Settings
|
||||
run: |
|
||||
df -h
|
||||
echo cmake --version
|
||||
echo "Installed Python versions:"
|
||||
ls -d /opt/python
|
||||
echo "python: $(which python), python3: $(which python3)"
|
||||
echo "Git version: $(git --version)"
|
||||
git config --global --add safe.directory $PWD
|
||||
git config fetch.parallel 10
|
||||
|
||||
- name: Fetch sources
|
||||
run: |
|
||||
./build_tools/fetch_sources.py --jobs 12
|
||||
|
||||
- name: Install python deps
|
||||
run: |
|
||||
pip install -r requirements.txt
|
||||
pip freeze
|
||||
|
||||
- name: Configure Projects
|
||||
env:
|
||||
amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
|
||||
package_version: ADHOCBUILD
|
||||
extra_cmake_options: ${{ inputs.extra_cmake_options }}
|
||||
BUILD_DIR: build
|
||||
run: |
|
||||
python3 build_tools/github_actions/build_configure.py
|
||||
|
||||
- name: Build therock-dist
|
||||
run: cmake --build build
|
||||
|
||||
- name: Build therock-archives
|
||||
run: cmake --build build --target therock-archives
|
||||
|
||||
- name: Report
|
||||
#if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "Full SDK du:"
|
||||
echo "------------"
|
||||
du -h -d 1 build/dist/rocm
|
||||
echo "Artifact Archives:"
|
||||
echo "------------------"
|
||||
ls -lh build/artifacts/*.tar.xz
|
||||
echo "Artifacts:"
|
||||
echo "----------"
|
||||
du -h -d 1 build/artifacts
|
||||
|
||||
- name: Configure AWS Credentials for non-forked repos
|
||||
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
|
||||
uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
|
||||
with:
|
||||
aws-region: us-east-2
|
||||
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
|
||||
|
||||
- name: Create Logs index Files and upload logs
|
||||
if: always()
|
||||
run: |
|
||||
python3 build_tools/github_actions/create_log_index.py \
|
||||
--build-dir=build \
|
||||
--amdgpu-family=${{ env.AMDGPU_FAMILIES }}
|
||||
|
||||
python3 build_tools/github_actions/upload_build_logs_to_s3.py \
|
||||
--build-dir=build \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }}
|
||||
|
||||
- name: Upload artifacts
|
||||
run: |
|
||||
python build_tools/github_actions/upload_build_artifacts.py \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
|
||||
--build-dir build
|
||||
|
||||
- name: Add Links to Job Summary
|
||||
if: always()
|
||||
run: |
|
||||
python build_tools/github_actions/upload_build_summary.py \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
|
||||
--build-dir build
|
||||
|
||||
therock-test-linux:
|
||||
name: "Test"
|
||||
needs: [therock-build-linux]
|
||||
uses: ./.github/workflows/therock-test-packages.yml
|
||||
with:
|
||||
amdgpu_families: ${{ inputs.amdgpu_families }}
|
||||
test_runs_on: vultr-linux-rocm
|
||||
artifact_run_id: ${{ github.run_id }}
|
||||
+81
@@ -0,0 +1,81 @@
|
||||
name: TheRock CI for rccl
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- develop
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
# A PR number if a pull request and otherwise the commit hash. This cancels
|
||||
# queued and in-progress runs for the same PR (presubmit) or commit
|
||||
# (postsubmit). The workflow name is prepended to avoid conflicts between
|
||||
# different workflows.
|
||||
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
runs-on: ubuntu-24.04
|
||||
env:
|
||||
# The commit being checked out is the merge commit for a PR. Its first
|
||||
# parent will be the tip of the base branch.
|
||||
BASE_REF: HEAD^
|
||||
outputs:
|
||||
enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
|
||||
steps:
|
||||
- name: "Checking out repository"
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
# We need the parent commit to do a diff
|
||||
fetch-depth: 2
|
||||
|
||||
- name: "Configuring CI options"
|
||||
id: configure
|
||||
run: python .github/scripts/therock_configure_ci.py
|
||||
|
||||
therock-ci-linux:
|
||||
name: TheRock CI Linux
|
||||
needs: setup
|
||||
if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
uses: ./.github/workflows/therock-ci-linux.yml
|
||||
secrets: inherit
|
||||
with:
|
||||
amdgpu_families: "gfx94X-dcgpu"
|
||||
extra_cmake_options: >
|
||||
-DTHEROCK_ENABLE_ALL=OFF
|
||||
-DTHEROCK_BUILD_TESTING=ON
|
||||
-DTHEROCK_BUNDLE_SYSDEPS=ON
|
||||
-DTHEROCK_ENABLE_COMM_LIBS=ON
|
||||
-DTHEROCK_ENABLE_ROCPROFV3=ON
|
||||
-DTHEROCK_USE_EXTERNAL_RCCL=ON
|
||||
-DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
|
||||
-DTHEROCK_RCCL_SOURCE_DIR=./rccl
|
||||
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
|
||||
-DTHEROCK_ENABLE_MPI=ON
|
||||
|
||||
therock_ci_summary:
|
||||
name: TheRock CI Summary
|
||||
if: always()
|
||||
needs:
|
||||
- setup
|
||||
- therock-ci-linux
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Output failed jobs
|
||||
run: |
|
||||
echo '${{ toJson(needs) }}'
|
||||
FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
|
||||
| jq --raw-output \
|
||||
'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
|
||||
)"
|
||||
if [[ "${FAILED_JOBS}" != "" ]]; then
|
||||
echo "The following jobs failed: ${FAILED_JOBS}"
|
||||
exit 1
|
||||
fi
|
||||
@@ -0,0 +1,58 @@
|
||||
name: TheRock Test Packages
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
amdgpu_families:
|
||||
type: string
|
||||
test_runs_on:
|
||||
type: string
|
||||
artifact_run_id:
|
||||
type: string
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
amdgpu_families:
|
||||
type: string
|
||||
test_runs_on:
|
||||
type: string
|
||||
artifact_run_id:
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
test_rccl:
|
||||
name: 'Test'
|
||||
runs-on: ${{ inputs.test_runs_on }}
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
env:
|
||||
VENV_DIR: ${{ github.workspace }}/.venv
|
||||
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
|
||||
OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
|
||||
THEROCK_BIN_DIR: "./build/bin"
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: 890c856134d955441790c8ed2d60ad4fb027f4e5
|
||||
|
||||
- name: Run setup test environment workflow
|
||||
uses: './.github/actions/setup_test_environment'
|
||||
with:
|
||||
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
|
||||
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
|
||||
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
|
||||
VENV_DIR: ${{ env.VENV_DIR }}
|
||||
FETCH_ARTIFACT_ARGS: "--rccl"
|
||||
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
|
||||
|
||||
- name: Run tests on vultr-linux-node
|
||||
run: |
|
||||
source /home/arravikum/TheRock/.venv/bin/activate
|
||||
cd /home/arravikum/cvs
|
||||
pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html
|
||||
|
||||
Reference in New Issue
Block a user