diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index d3904461cd..f595a2aeb8 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -18,7 +18,7 @@ jobs: permissions: id-token: write container: - image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:283673fe3e1bf498d079e3f386b794af1b4f71845a9a0107c6cf7aa304dce050 + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:1f1ce0ab151146c7f86ee4345be74c42d8ca83200d9d26843e8a71df01ecad4e options: -v /runner/config:/home/awsconfig/ env: AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} @@ -109,15 +109,17 @@ jobs: therock-test-linux-multi-node: name: "Test multi-node" + if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }} needs: [therock-build-linux] uses: ./.github/workflows/therock-test-packages-multi-node.yml with: amdgpu_families: ${{ inputs.amdgpu_families }} - test_runs_on: vultr-linux-rocm + test_runs_on: nova-linux-slurm-scale-runner artifact_run_id: ${{ github.run_id }} therock-test-linux-single-node: name: "Test single-node" + if: ${{ inputs.amdgpu_families == 'gfx94X-dcgpu' }} needs: [therock-build-linux] uses: ./.github/workflows/therock-test-packages-single-node.yml with: diff --git a/.github/workflows/therock-ci.yml b/.github/workflows/therock-ci.yml index 1d866b14ac..a424b3ded2 100644 --- a/.github/workflows/therock-ci.yml +++ b/.github/workflows/therock-ci.yml @@ -44,10 +44,14 @@ jobs: permissions: contents: read id-token: write + strategy: + fail-fast: false + matrix: + amdgpu_family: [gfx94X-dcgpu, gfx950-dcgpu] uses: ./.github/workflows/therock-ci-linux.yml secrets: inherit with: - amdgpu_families: "gfx94X-dcgpu" + amdgpu_families: ${{ matrix.amdgpu_family }} extra_cmake_options: > -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_BUILD_TESTING=ON diff --git a/.github/workflows/therock-test-packages-multi-node.yml b/.github/workflows/therock-test-packages-multi-node.yml index 0aa2def6d8..acb459a028 100644 --- 
a/.github/workflows/therock-test-packages-multi-node.yml +++ b/.github/workflows/therock-test-packages-multi-node.yml @@ -50,8 +50,19 @@ jobs: FETCH_ARTIFACT_ARGS: "--rccl" IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} - - name: Test + # Runs the multi-node RCCL tests through Slurm on the MI350X cluster; salloc requests 4 exclusive nodes (-N 4 --exclusive) from the meta64 partition. + # salloc holds the allocation while the quoted bash command runs and releases the nodes automatically once that command exits. + - name: Test gfx950 + if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }} run: | - source /home/arravikum/TheRock/.venv/bin/activate - cd /home/arravikum/cvs - pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html + salloc -N 4 -p meta64 --exclusive bash -c " + source /home/arravikum/TheRock/.venv/bin/activate && + cd /home/arravikum/cvs && + python input/setup.py && + pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \ + --cluster_file ./input/cluster.json \ + --config_file ./input/mi350_config.json \ + --log-file=/tmp/rccl_log.log \ + --html=/home/arravikum/cvs/ci_test_report.html \ + --capture=tee-sys \ + --self-contained-html"