Enable multi-node RCCL tests on the MI350X Slurm cluster. (#1900)

* Add tests on slurm cluster

* Integrate slurm.

* Add flags.

* Added dynamic selection of runners for tests and cleanup for slurm reservation

* Revert "Added dynamic selection of runners for tests and cleanup for slurm reservation"

This reverts commit d5350ff6e4f563ddd56ad81e4bc2a393ed55ba00.

* Refactor so tests run on both architectures.

* continue on error

* Set fail-fast: false on the matrix

* remove scancel

* skip all single node tests

* fix pattern matching for pytest

* switch to always skip github job

* Update to latest allocation.

* Clean up workflows and update docker image.

* Updated container image published from PR #1517

* Switch back to TheRock main branch sha.

---------

Co-authored-by: arravikum <arravikum@amd.com>
Dieser Commit ist enthalten in:
Sai Enduri
2025-09-23 22:00:26 -07:00
committet von GitHub
Ursprung d86cf78810
Commit 01d16d4139
3 geänderte Dateien mit 24 neuen und 7 gelöschten Zeilen
+4 -2
Datei anzeigen
@@ -18,7 +18,7 @@ jobs:
permissions:
id-token: write
container:
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:283673fe3e1bf498d079e3f386b794af1b4f71845a9a0107c6cf7aa304dce050
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:1f1ce0ab151146c7f86ee4345be74c42d8ca83200d9d26843e8a71df01ecad4e
options: -v /runner/config:/home/awsconfig/
env:
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
@@ -109,15 +109,17 @@ jobs:
therock-test-linux-multi-node:
name: "Test multi-node"
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages-multi-node.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
test_runs_on: vultr-linux-rocm
test_runs_on: nova-linux-slurm-scale-runner
artifact_run_id: ${{ github.run_id }}
therock-test-linux-single-node:
name: "Test single-node"
if: ${{ inputs.amdgpu_families == 'gfx94X-dcgpu' }}
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages-single-node.yml
with:
+5 -1
Datei anzeigen
@@ -44,10 +44,14 @@ jobs:
permissions:
contents: read
id-token: write
strategy:
fail-fast: false
matrix:
amdgpu_family: [gfx94X-dcgpu, gfx950-dcgpu]
uses: ./.github/workflows/therock-ci-linux.yml
secrets: inherit
with:
amdgpu_families: "gfx94X-dcgpu"
amdgpu_families: ${{ matrix.amdgpu_family }}
extra_cmake_options: >
-DTHEROCK_ENABLE_ALL=OFF
-DTHEROCK_BUILD_TESTING=ON
@@ -50,8 +50,19 @@ jobs:
FETCH_ARTIFACT_ARGS: "--rccl"
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
- name: Test
# The following step uses Slurm to run multi-node RCCL tests on the MI350X Slurm cluster.
# salloc holds 4 nodes while the commands inside the block run; once the block completes, salloc automatically releases the nodes.
- name: Test gfx950
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
run: |
source /home/arravikum/TheRock/.venv/bin/activate
cd /home/arravikum/cvs
pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html
salloc -N 4 -p meta64 --exclusive bash -c "
source /home/arravikum/TheRock/.venv/bin/activate &&
cd /home/arravikum/cvs &&
python input/setup.py &&
pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \
--cluster_file ./input/cluster.json \
--config_file ./input/mi350_config.json \
--log-file=/tmp/rccl_log.log \
--html=/home/arravikum/cvs/ci_test_report.html \
--capture=tee-sys \
--self-contained-html"