From 4b295c9893fa67dbff43702511d818883b704af3 Mon Sep 17 00:00:00 2001 From: Geo Min Date: Tue, 13 Jan 2026 08:35:58 -0800 Subject: [PATCH] [TheRock CI] Adding working single node tests (#2142) * Adding working single node tests * Revert to old docker sha * adding back no perf tests --------- Co-authored-by: Aravind Ravikumar --- .github/workflows/therock-ci-linux.yml | 4 ++-- .../workflows/therock-test-packages-multi-node.yml | 2 +- .../workflows/therock-test-packages-single-node.yml | 11 +++++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index b5d88c773a..f544549c7a 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -34,7 +34,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit + ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit - name: Checkout rccl repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -143,5 +143,5 @@ jobs: with: amdgpu_families: ${{ inputs.amdgpu_families }} artifact_group: ${{ inputs.artifact_group }} - test_runs_on: linux-mi325-1gpu-ossci-rocm-frac + test_runs_on: linux-mi325-4gpu-ossci-rocm artifact_run_id: ${{ github.run_id }} diff --git a/.github/workflows/therock-test-packages-multi-node.yml b/.github/workflows/therock-test-packages-multi-node.yml index f4ca2bf4e4..db218be4db 100644 --- a/.github/workflows/therock-test-packages-multi-node.yml +++ b/.github/workflows/therock-test-packages-multi-node.yml @@ -46,7 +46,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit + ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' diff --git a/.github/workflows/therock-test-packages-single-node.yml b/.github/workflows/therock-test-packages-single-node.yml index 71c7c39d5e..40cd09908d 100644 --- a/.github/workflows/therock-test-packages-single-node.yml +++ b/.github/workflows/therock-test-packages-single-node.yml @@ -30,13 +30,16 @@ jobs: name: 'Test single-node' runs-on: ${{ inputs.test_runs_on }} container: - image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26 + image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98 options: --ipc host --group-add video --device /dev/kfd --device /dev/dri --group-add 110 + --ulimit memlock=-1:-1 + --security-opt seccomp=unconfined --env-file /etc/podinfo/gha-gpu-isolation-settings + --user 0:0 defaults: run: shell: bash @@ -50,7 +53,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit + ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' @@ -70,5 +73,5 @@ jobs: # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests. run: | pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \ - --log-cli-level=info \ - -k "not test_rccl_correctness_tests" + -k "not test_rccl_correctness_tests" \ + --log-cli-level=info