[TheRock CI] Adding single node tests for RCCL (#1876)

* Add single-node testing

* Adding single node test

* Adding quotes

* fix typo

* Adding test flag

* No MPI

* Adding openmpi install

* Adding comment

* PR comments

* Missing proj

* Adding half

* Adding rocr runtime

* Adding them all

* new sha

* Fixing script

* Removing confusing skip test case

* Adding docs

* Update .github/workflows/therock-test-packages-single-node.yml

Co-authored-by: Marius Brehler <marius.brehler@amd.com>

---------

Co-authored-by: Marius Brehler <marius.brehler@amd.com>

[ROCm/rccl commit: f404624d9e]
Этот коммит содержится в:
Geo Min
2025-08-27 08:13:10 -07:00
коммит произвёл GitHub
родитель fde5d7a8be
Коммит 6db483845d
4 изменённых файлов: 90 добавлений и 31 удалений
+16 -25
Просмотреть файл
@@ -98,38 +98,29 @@ jobs:
aws-region: us-east-2
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
- name: Create Logs index Files and upload logs
- name: Post Build Upload
if: always()
run: |
python3 build_tools/github_actions/create_log_index.py \
--build-dir=build \
--amdgpu-family=${{ env.AMDGPU_FAMILIES }}
python3 build_tools/github_actions/upload_build_logs_to_s3.py \
--build-dir=build \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }}
- name: Upload artifacts
run: |
python build_tools/github_actions/upload_build_artifacts.py \
python3 build_tools/github_actions/post_build_upload.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir build
--build-dir build \
--upload
- name: Add Links to Job Summary
if: always()
run: |
python build_tools/github_actions/upload_build_summary.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir build
therock-test-linux:
name: "Test"
therock-test-linux-multi-node:
name: "Test multi-node"
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages.yml
uses: ./.github/workflows/therock-test-packages-multi-node.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
test_runs_on: vultr-linux-rocm
artifact_run_id: ${{ github.run_id }}
therock-test-linux-single-node:
name: "Test single-node"
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages-single-node.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
test_runs_on: linux-mi325-1gpu-ossci-rocm
artifact_run_id: ${{ github.run_id }}
+1 -1
Просмотреть файл
@@ -57,7 +57,7 @@ jobs:
-DTHEROCK_USE_EXTERNAL_RCCL=ON
-DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
-DTHEROCK_RCCL_SOURCE_DIR=./rccl
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
-DTHEROCK_ENABLE_MPI=ON
therock_ci_summary:
+4 -5
Просмотреть файл
@@ -1,4 +1,4 @@
name: TheRock Test Packages
name: TheRock Test Packages multi-node
on:
workflow_call:
@@ -22,8 +22,8 @@ permissions:
contents: read
jobs:
test_rccl:
name: 'Test'
test_rccl_multi_node:
name: 'Test multi-node'
runs-on: ${{ inputs.test_runs_on }}
defaults:
run:
@@ -50,9 +50,8 @@ jobs:
FETCH_ARTIFACT_ARGS: "--rccl"
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
- name: Run tests on vultr-linux-node
- name: Test
run: |
source /home/arravikum/TheRock/.venv/bin/activate
cd /home/arravikum/cvs
pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html
+69
Просмотреть файл
@@ -0,0 +1,69 @@
# Workflow that runs the RCCL single-node pytest script inside a container on the
# runner selected by the `test_runs_on` input.
name: TheRock Test Packages single-node
on:
# Lets other workflows invoke this one as a reusable workflow.
workflow_call:
inputs:
amdgpu_families:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
# Manual trigger; declares the same three string inputs as workflow_call.
workflow_dispatch:
inputs:
amdgpu_families:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
permissions:
contents: read
jobs:
test_rccl_single_node:
name: 'Test single-node'
runs-on: ${{ inputs.test_runs_on }}
container:
# The image is pinned by digest rather than by a mutable tag.
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
# Passes the /dev/kfd and /dev/dri devices into the container, adds the `video` and
# `992` groups, and loads environment variables from the listed env-file.
# NOTE(review): group 992 is presumably the runner's render group - confirm.
options: --ipc host
--group-add video
--device /dev/kfd
--device /dev/dri
--group-add 992
--env-file /etc/podinfo/gha-gpu-isolation-settings
defaults:
run:
shell: bash
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
OUTPUT_ARTIFACTS_DIR: "./build"
THEROCK_BIN_DIR: "./build/bin"
steps:
# Checks out ROCm/TheRock at a pinned commit, not the repository that called this workflow.
- name: Checkout Repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: f89dcd5c5625baecb467b9287e952c5c819073fd
# Local action resolved from the tree checked out above.
# NOTE(review): presumably fetches the rccl and test artifacts of ARTIFACT_RUN_ID into
# OUTPUT_ARTIFACTS_DIR and sets up the venv - confirm against the action definition.
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
with:
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
VENV_DIR: ${{ env.VENV_DIR }}
FETCH_ARTIFACT_ARGS: "--rccl --tests"
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
- name: Test
timeout-minutes: 15
# TheRock CI in RCCL currently always builds with MPI support enabled, which causes the
# RCCL correctness tests to fail on the mi325 runners because they don't have MPI
# pre-installed. The -k filter below deselects test_rccl_correctness_tests for that reason.
# TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
run: |
pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
--log-cli-level=info \
-k "not test_rccl_correctness_tests"