[TheRock CI] Adding single node tests for RCCL (#1876)
* Add single-node testing
* Adding single node test
* Adding quotes
* fix typo
* Adding test flag
* No MPI
* Adding openmpi install
* Adding comment
* PR comments
* Missing proj
* Adding half
* Adding rocr runtime
* Adding them all
* new sha
* Fixing script
* Removing confusing skip test case
* Adding docs
* Update .github/workflows/therock-test-packages-single-node.yml
Co-authored-by: Marius Brehler <marius.brehler@amd.com>
---------
Co-authored-by: Marius Brehler <marius.brehler@amd.com>
[ROCm/rccl commit: f404624d9e]
Этот коммит содержится в:
поставляемый
+16
-25
@@ -98,38 +98,29 @@ jobs:
|
||||
aws-region: us-east-2
|
||||
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
|
||||
|
||||
- name: Create Logs index Files and upload logs
|
||||
- name: Post Build Upload
|
||||
if: always()
|
||||
run: |
|
||||
python3 build_tools/github_actions/create_log_index.py \
|
||||
--build-dir=build \
|
||||
--amdgpu-family=${{ env.AMDGPU_FAMILIES }}
|
||||
|
||||
python3 build_tools/github_actions/upload_build_logs_to_s3.py \
|
||||
--build-dir=build \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }}
|
||||
|
||||
- name: Upload artifacts
|
||||
run: |
|
||||
python build_tools/github_actions/upload_build_artifacts.py \
|
||||
python3 build_tools/github_actions/post_build_upload.py \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
|
||||
--build-dir build
|
||||
--build-dir build \
|
||||
--upload
|
||||
|
||||
- name: Add Links to Job Summary
|
||||
if: always()
|
||||
run: |
|
||||
python build_tools/github_actions/upload_build_summary.py \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
|
||||
--build-dir build
|
||||
|
||||
therock-test-linux:
|
||||
name: "Test"
|
||||
therock-test-linux-multi-node:
|
||||
name: "Test multi-node"
|
||||
needs: [therock-build-linux]
|
||||
uses: ./.github/workflows/therock-test-packages.yml
|
||||
uses: ./.github/workflows/therock-test-packages-multi-node.yml
|
||||
with:
|
||||
amdgpu_families: ${{ inputs.amdgpu_families }}
|
||||
test_runs_on: vultr-linux-rocm
|
||||
artifact_run_id: ${{ github.run_id }}
|
||||
|
||||
therock-test-linux-single-node:
|
||||
name: "Test single-node"
|
||||
needs: [therock-build-linux]
|
||||
uses: ./.github/workflows/therock-test-packages-single-node.yml
|
||||
with:
|
||||
amdgpu_families: ${{ inputs.amdgpu_families }}
|
||||
test_runs_on: linux-mi325-1gpu-ossci-rocm
|
||||
artifact_run_id: ${{ github.run_id }}
|
||||
|
||||
поставляемый
+1
-1
@@ -57,7 +57,7 @@ jobs:
|
||||
-DTHEROCK_USE_EXTERNAL_RCCL=ON
|
||||
-DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
|
||||
-DTHEROCK_RCCL_SOURCE_DIR=./rccl
|
||||
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
|
||||
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
|
||||
-DTHEROCK_ENABLE_MPI=ON
|
||||
|
||||
therock_ci_summary:
|
||||
|
||||
+4
-5
@@ -1,4 +1,4 @@
|
||||
name: TheRock Test Packages
|
||||
name: TheRock Test Packages multi-node
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
@@ -22,8 +22,8 @@ permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
test_rccl:
|
||||
name: 'Test'
|
||||
test_rccl_multi_node:
|
||||
name: 'Test multi-node'
|
||||
runs-on: ${{ inputs.test_runs_on }}
|
||||
defaults:
|
||||
run:
|
||||
@@ -50,9 +50,8 @@ jobs:
|
||||
FETCH_ARTIFACT_ARGS: "--rccl"
|
||||
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
|
||||
|
||||
- name: Run tests on vultr-linux-node
|
||||
- name: Test
|
||||
run: |
|
||||
source /home/arravikum/TheRock/.venv/bin/activate
|
||||
cd /home/arravikum/cvs
|
||||
pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html
|
||||
|
||||
+69
@@ -0,0 +1,69 @@
|
||||
name: TheRock Test Packages single-node
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
amdgpu_families:
|
||||
type: string
|
||||
test_runs_on:
|
||||
type: string
|
||||
artifact_run_id:
|
||||
type: string
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
amdgpu_families:
|
||||
type: string
|
||||
test_runs_on:
|
||||
type: string
|
||||
artifact_run_id:
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
test_rccl_single_node:
|
||||
name: 'Test single-node'
|
||||
runs-on: ${{ inputs.test_runs_on }}
|
||||
container:
|
||||
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
|
||||
options: --ipc host
|
||||
--group-add video
|
||||
--device /dev/kfd
|
||||
--device /dev/dri
|
||||
--group-add 992
|
||||
--env-file /etc/podinfo/gha-gpu-isolation-settings
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
env:
|
||||
VENV_DIR: ${{ github.workspace }}/.venv
|
||||
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
|
||||
OUTPUT_ARTIFACTS_DIR: "./build"
|
||||
THEROCK_BIN_DIR: "./build/bin"
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: f89dcd5c5625baecb467b9287e952c5c819073fd
|
||||
|
||||
- name: Run setup test environment workflow
|
||||
uses: './.github/actions/setup_test_environment'
|
||||
with:
|
||||
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
|
||||
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
|
||||
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
|
||||
VENV_DIR: ${{ env.VENV_DIR }}
|
||||
FETCH_ARTIFACT_ARGS: "--rccl --tests"
|
||||
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
|
||||
|
||||
- name: Test
|
||||
timeout-minutes: 15
|
||||
# Currently, TheRock CI in RCCL always builds with MPI support enabled, which causes the
|
||||
# RCCL correctness tests to fail on the mi325 runners which don't have MPI pre-installed.
|
||||
# TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
|
||||
run: |
|
||||
pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
|
||||
--log-cli-level=info \
|
||||
-k "not test_rccl_correctness_tests"
|
||||
Ссылка в новой задаче
Block a user