dfdb64572c
* Adding working single node tests
* Revert to old docker sha
* adding back no perf tests
---------
Co-authored-by: Aravind Ravikumar <arravikum@amd.com>
[ROCm/rccl commit: 4b295c9893]
78 rindas
2.5 KiB
YAML
78 rindas
2.5 KiB
YAML
name: TheRock Test Packages single-node

# Triggers: `workflow_call` lets another workflow invoke this one as a
# reusable workflow; `workflow_dispatch` allows a manual run. Both declare
# the same four string inputs so either entry point drives the job the
# same way.
on:
  workflow_call:
    inputs:
      # Forwarded to the setup action as AMDGPU_FAMILIES.
      amdgpu_families:
        type: string
      # Forwarded to the setup action as ARTIFACT_GROUP.
      artifact_group:
        type: string
      # Used verbatim as the job's `runs-on` value.
      test_runs_on:
        type: string
      # Exposed to the job as the ARTIFACT_RUN_ID environment variable.
      artifact_run_id:
        type: string
  workflow_dispatch:
    inputs:
      amdgpu_families:
        type: string
      artifact_group:
        type: string
      test_runs_on:
        type: string
      artifact_run_id:
        type: string

# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read

jobs:
  test_rccl_single_node:
    name: 'Test single-node'
    # The runner label is supplied by the caller through `test_runs_on`.
    runs-on: ${{ inputs.test_runs_on }}
    container:
      # Image is pinned by digest rather than by a mutable tag.
      image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98
      # Folded scalar: the lines below are joined with single spaces into one
      # option string, with no trailing newline.
      options: >-
        --ipc host
        --group-add video
        --device /dev/kfd
        --device /dev/dri
        --group-add 110
        --ulimit memlock=-1:-1
        --security-opt seccomp=unconfined
        --env-file /etc/podinfo/gha-gpu-isolation-settings
        --user 0:0
    defaults:
      run:
        shell: bash
    env:
      VENV_DIR: ${{ github.workspace }}/.venv
      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
      OUTPUT_ARTIFACTS_DIR: "./build"
      THEROCK_BIN_DIR: "./build/bin"
    steps:
      # Check out ROCm/TheRock at a pinned commit; the later steps reference
      # paths (the local action and the pytest script) relative to this
      # checkout.
      - name: Checkout Repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          repository: "ROCm/TheRock"
          ref: ff46daa79b4c826c4f4676893d0d6586de567dfa  # 2026-01-12 commit

      # Local action path, resolved inside the workspace populated by the
      # checkout step above.
      - name: Run setup test environment workflow
        uses: './.github/actions/setup_test_environment'
        with:
          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
          VENV_DIR: ${{ env.VENV_DIR }}
          FETCH_ARTIFACT_ARGS: "--rccl --tests"
          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}

      - name: Test
        timeout-minutes: 15
        # Currently, TheRock CI in RCCL always builds with MPI support enabled,
        # which causes the RCCL correctness tests to fail on the mi325 runners,
        # which don't have MPI pre-installed.
        # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
        run: |
          pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
            -k "not test_rccl_correctness_tests" \
            --log-cli-level=info