From bec7d58b045d17fc9739a17474d2fc71c719b88d Mon Sep 17 00:00:00 2001
From: Geo Min
Date: Wed, 20 Aug 2025 15:07:23 -0700
Subject: [PATCH] [TheRock CI] Adding TheRock RCCL tests (#1873)

* First commit for rccl multi node test workflow
* Adding workflow dispatch
* Added branch based pull trigger
* Changed typo in branch name
* Add input variables to push
* Removed input variables to push
* Added self hosted runner for Vultr cloud
* Skipping build and only running test
* Changed test runner label name
* Made changes to executable paths in test script
* Made changes to run
* Made changes to cd into cvs dir
* This is a dummy commit
* Added cmake options
* Modified build options
* Commiting build changes
* Adding rccl and rccl-tests
* Re-ordering rccl and rccl-tests
* adding --global command
* modified cmake command
* modified script paths
* Testing OIDC for rccl repo
* Testing OIDC for rccl repo
* Testing build and upload workflow
* use default env variable for AMDGPU families on push workflow trigger
* Adding cleanup and correct role
* Adding additional yml files
* Fixing typo';
* Adding new sha
* Adding correct gpu target
* Adding back venv bin activate
* Adding workflow dispatch for tests
* Testing
* Adding cat
* Adding cat
* Adding rocm dir change
* Adding checkout
* cat with sudo
* rccl checkout
* correcting branch
* removing sudo
* trying to adjust correct path'
* Adding output dir path
* Use docker container with pre-installed MPI
* Adding back build steps
* Fixing SHA
* Adding exclusion logic:
* Adding test
* Adding CI check
* Removing testing
* Limit to build only rccl, rccl-tests and required dependencies
* Adding test
* Removing test
* Removing quote
* Reverting test
* PR comments

---------

Co-authored-by: arravikum
Co-authored-by: Marius Brehler

[ROCm/rccl commit: f9a957bbabbd916203b32ff575981f29e5dee96b]
---
 .../.github/scripts/therock_configure_ci.py | 146 ++++++++++++++++++
 .../.github/workflows/therock-ci-linux.yml  | 135 ++++++++++++++++
 .../rccl/.github/workflows/therock-ci.yml   |  81 ++++++++++
 .../workflows/therock-test-packages.yml     |  58 +++++++
 4 files changed, 420 insertions(+)
 create mode 100644 projects/rccl/.github/scripts/therock_configure_ci.py
 create mode 100644 projects/rccl/.github/workflows/therock-ci-linux.yml
 create mode 100644 projects/rccl/.github/workflows/therock-ci.yml
 create mode 100644 projects/rccl/.github/workflows/therock-test-packages.yml

diff --git a/projects/rccl/.github/scripts/therock_configure_ci.py b/projects/rccl/.github/scripts/therock_configure_ci.py
new file mode 100644
index 0000000000..2afff17f95
--- /dev/null
+++ b/projects/rccl/.github/scripts/therock_configure_ci.py
@@ -0,0 +1,146 @@
+import fnmatch
+import json
+import os
+from pathlib import Path
+import subprocess
+import sys
+from typing import Iterable, Optional, Mapping
+
+def gha_set_output(vars: Mapping[str, str | Path]):
+    """Sets values in a step's output parameters.
+
+    This appends to the file located at the $GITHUB_OUTPUT environment variable.
+
+    See
+      * https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
+      * https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
+    """
+    print(f"Setting github output:\n{vars}")
+
+    step_output_file = os.getenv("GITHUB_OUTPUT")
+    if not step_output_file:
+        print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
+        return
+
+    with open(step_output_file, "a") as f:
+        f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())
+
+def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
+    """Returns the paths of modified files relative to the base reference.
+
+    Returns None if `git diff` does not finish within 60 seconds. A failing
+    `git diff` (non-zero exit code) raises subprocess.CalledProcessError.
+    """
+    try:
+        return subprocess.run(
+            ["git", "diff", "--name-only", base_ref],
+            stdout=subprocess.PIPE,
+            check=True,
+            text=True,
+            timeout=60,
+        ).stdout.splitlines()
+    except subprocess.TimeoutExpired:
+        # subprocess.run() reports an expired timeout as TimeoutExpired, which
+        # is not a subclass of the builtin TimeoutError.
+        print(
+            "Computing modified files timed out. Not using PR diff to determine"
+            " jobs to run.",
+            file=sys.stderr,
+        )
+        return None
+
+GITHUB_WORKFLOWS_CI_PATTERNS = [
+    "therock*.yml",
+]
+
+
+def is_path_workflow_file_related_to_ci(path: str) -> bool:
+    """Returns true if the path matches a CI pattern under .github/workflows/."""
+    return any(
+        fnmatch.fnmatch(path, ".github/workflows/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    )
+
+def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
+    """Returns true if at least one path is a CI-related workflow file."""
+    if paths is None:
+        return False
+    return any(is_path_workflow_file_related_to_ci(p) for p in paths)
+
+# Paths matching any of these patterns are considered to have no influence over
+# build or test workflows so any related jobs can be skipped if all paths
+# modified by a commit/PR match a pattern in this list.
+SKIPPABLE_PATH_PATTERNS = [
+    "docs/*",
+    "*.gitignore",
+    "*.md",
+    "*LICENSE*",
+    "*NOTICES*",
+    '.github/CODEOWNERS',
+    '.github/*.md',
+    '.github/dependabot.yml',
+    '.azuredevops*',
+]
+
+def is_path_skippable(path: str) -> bool:
+    """Determines if a given relative path to a file matches any skippable patterns."""
+    return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS)
+
+def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
+    """Returns true if at least one path is not in the skippable set."""
+    if paths is None:
+        return False
+    return any(not is_path_skippable(p) for p in paths)
+
+def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
+    """Returns true if CI workflows should run given a list of modified paths."""
+
+    if paths is None:
+        print("No files were modified, skipping TheRock CI jobs")
+        return False
+
+    # Materialize once so that one-shot iterables (e.g. generators) work too.
+    paths_set = set(paths)
+    github_workflows_paths = {
+        p for p in paths_set if p.startswith(".github/workflows")
+    }
+    other_paths = paths_set - github_workflows_paths
+
+    related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
+    contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
+
+    print("should_ci_run_given_modified_paths findings:")
+    print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")
+
+    if related_to_ci:
+        print("Enabling build jobs since a related workflow file was modified")
+        return True
+    elif contains_other_non_skippable_files:
+        print("Enabling TheRock CI jobs since a non-skippable path was modified")
+        return True
+    else:
+        print(
+            "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
+        )
+        return False
+
+def main(args):
+    """Decides whether TheRock CI should run and writes it as a step output.
+
+    `args` is a mapping whose "base_ref" entry is the git ref to diff against.
+    """
+    base_ref = args.get("base_ref")
+    modified_paths = get_modified_paths(base_ref)
+    # get_modified_paths() returns None on timeout; None cannot be sliced.
+    shown_paths = None if modified_paths is None else modified_paths[:200]
+    print("modified_paths (max 200):", shown_paths)
+    enable_jobs = should_ci_run_given_modified_paths(modified_paths)
+    output = {
+        'enable_therock_ci': json.dumps(enable_jobs)
+    }
+    gha_set_output(output)
+
+if __name__ == "__main__":
+    args = {}
+    args["base_ref"] = os.environ.get("BASE_REF", "HEAD^1")
+    main(args)
diff --git a/projects/rccl/.github/workflows/therock-ci-linux.yml b/projects/rccl/.github/workflows/therock-ci-linux.yml
new file mode 100644
index 0000000000..49cc64b0ea
--- /dev/null
+++ b/projects/rccl/.github/workflows/therock-ci-linux.yml
@@ -0,0 +1,135 @@
+name: TheRock CI Linux
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_families:
+        type: string
+      extra_cmake_options:
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  therock-build-linux:
+    name: Build Linux Packages
+    runs-on: azure-linux-scale-rocm
+    permissions:
+      id-token: write
+    container:
+      image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:283673fe3e1bf498d079e3f386b794af1b4f71845a9a0107c6cf7aa304dce050
+      options: -v /runner/config:/home/awsconfig/
+    env:
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+      TEATIME_FORCE_INTERACTIVE: 0
+      AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
+    steps:
+      - name: Checkout TheRock repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+          ref: 5f35280878ca80755c5456dfdb527c29d3f5058c
+
+      - name: Checkout rccl repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/rccl"
+          path: rccl
+
+      - name: Checkout rccl-tests repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/rccl-tests"
+          path: rccl-tests
+
+      - name: Runner Health Settings
+        run: |
+          df -h
+          cmake --version
+          echo "Installed Python versions:"
+          ls -d /opt/python
+          echo "python: $(which python), python3: $(which python3)"
+          echo "Git version: $(git --version)"
+          git config --global --add safe.directory $PWD
+          git config fetch.parallel 10
+
+      - name: Fetch sources
+        run: |
+          ./build_tools/fetch_sources.py --jobs 12
+
+      - name: Install python deps
+        run: |
+          pip install -r requirements.txt
+          pip freeze
+
+      - name: Configure Projects
+        env:
+          amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
+          package_version: ADHOCBUILD
+          extra_cmake_options: ${{ inputs.extra_cmake_options }}
+          BUILD_DIR: build
+        run: |
+          python3 build_tools/github_actions/build_configure.py
+
+      - name: Build therock-dist
+        run: cmake --build build
+
+      - name: Build therock-archives
+        run: cmake --build build --target therock-archives
+
+      - name: Report
+        #if: ${{ !cancelled() }}
+        run: |
+          echo "Full SDK du:"
+          echo "------------"
+          du -h -d 1 build/dist/rocm
+          echo "Artifact Archives:"
+          echo "------------------"
+          ls -lh build/artifacts/*.tar.xz
+          echo "Artifacts:"
+          echo "----------"
+          du -h -d 1 build/artifacts
+
+      - name: Configure AWS Credentials for non-forked repos
+        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
+        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
+
+      - name: Create Logs index Files and upload logs
+        if: always()
+        run: |
+          python3 build_tools/github_actions/create_log_index.py \
+            --build-dir=build \
+            --amdgpu-family=${{ env.AMDGPU_FAMILIES }}
+
+          python3 build_tools/github_actions/upload_build_logs_to_s3.py \
+            --build-dir=build \
+            --run-id ${{ github.run_id }} \
+            --amdgpu-family ${{ env.AMDGPU_FAMILIES }}
+
+      - name: Upload artifacts
+        run: |
+          python build_tools/github_actions/upload_build_artifacts.py \
+            --run-id ${{ github.run_id }} \
+            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
+            --build-dir build
+
+      - name: Add Links to Job Summary
+        if: always()
+        run: |
+          python build_tools/github_actions/upload_build_summary.py \
+            --run-id ${{ github.run_id }} \
+            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
+            --build-dir build
+
+  therock-test-linux:
+    name: "Test"
+    needs: [therock-build-linux]
+    uses: ./.github/workflows/therock-test-packages.yml
+    with:
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: vultr-linux-rocm
+      artifact_run_id: ${{ github.run_id }}
diff --git a/projects/rccl/.github/workflows/therock-ci.yml b/projects/rccl/.github/workflows/therock-ci.yml
new file mode 100644
index 0000000000..f36bcc224f
--- /dev/null
+++ b/projects/rccl/.github/workflows/therock-ci.yml
@@ -0,0 +1,81 @@
+name: TheRock CI for rccl
+
+on:
+  push:
+    branches:
+      - develop
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup:
+    runs-on: ubuntu-24.04
+    env:
+      # This workflow runs on pushes and manual dispatches, so the checked out
+      # commit is diffed against its first parent to find the modified files.
+      BASE_REF: HEAD^
+    outputs:
+      enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
+    steps:
+      - name: "Checking out repository"
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          # We need the parent commit to do a diff
+          fetch-depth: 2
+
+      - name: "Configuring CI options"
+        id: configure
+        run: python .github/scripts/therock_configure_ci.py
+
+  therock-ci-linux:
+    name: TheRock CI Linux
+    needs: setup
+    if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
+    permissions:
+      contents: read
+      id-token: write
+    uses: ./.github/workflows/therock-ci-linux.yml
+    secrets: inherit
+    with:
+      amdgpu_families: "gfx94X-dcgpu"
+      extra_cmake_options: >
+        -DTHEROCK_ENABLE_ALL=OFF
+        -DTHEROCK_BUILD_TESTING=ON
+        -DTHEROCK_BUNDLE_SYSDEPS=ON
+        -DTHEROCK_ENABLE_COMM_LIBS=ON
+        -DTHEROCK_ENABLE_ROCPROFV3=ON
+        -DTHEROCK_USE_EXTERNAL_RCCL=ON
+        -DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
+        -DTHEROCK_RCCL_SOURCE_DIR=./rccl
+        -DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
+        -DTHEROCK_ENABLE_MPI=ON
+
+  therock_ci_summary:
+    name: TheRock CI Summary
+    if: always()
+    needs:
+      - setup
+      - therock-ci-linux
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Output failed jobs
+        run: |
+          echo '${{ toJson(needs) }}'
+          FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
+            | jq --raw-output \
+            'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
+          )"
+          if [[ "${FAILED_JOBS}" != "" ]]; then
+            echo "The following jobs failed: ${FAILED_JOBS}"
+            exit 1
+          fi
diff --git a/projects/rccl/.github/workflows/therock-test-packages.yml b/projects/rccl/.github/workflows/therock-test-packages.yml
new file mode 100644
index 0000000000..72ae345079
--- /dev/null
+++ b/projects/rccl/.github/workflows/therock-test-packages.yml
@@ -0,0 +1,58 @@
+name: TheRock Test Packages
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_families:
+        type: string
+      test_runs_on:
+        type: string
+      artifact_run_id:
+        type: string
+  workflow_dispatch:
+    inputs:
+      amdgpu_families:
+        type: string
+      test_runs_on:
+        type: string
+      artifact_run_id:
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  test_rccl:
+    name: 'Test'
+    runs-on: ${{ inputs.test_runs_on }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
+      OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
+      THEROCK_BIN_DIR: "./build/bin"
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+          ref: 890c856134d955441790c8ed2d60ad4fb027f4e5
+
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: "--rccl"
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      - name: Run tests on vultr-linux-node
+        run: |
+          source /home/arravikum/TheRock/.venv/bin/activate
+          cd /home/arravikum/cvs
+          pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html
+