diff --git a/projects/rccl/.github/workflows/therock-ci-linux.yml b/projects/rccl/.github/workflows/therock-ci-linux.yml index 9ba0b563cd..d3904461cd 100644 --- a/projects/rccl/.github/workflows/therock-ci-linux.yml +++ b/projects/rccl/.github/workflows/therock-ci-linux.yml @@ -98,38 +98,29 @@ jobs: aws-region: us-east-2 role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external - - name: Create Logs index Files and upload logs + - name: Post Build Upload if: always() run: | - python3 build_tools/github_actions/create_log_index.py \ - --build-dir=build \ - --amdgpu-family=${{ env.AMDGPU_FAMILIES }} - - python3 build_tools/github_actions/upload_build_logs_to_s3.py \ - --build-dir=build \ - --run-id ${{ github.run_id }} \ - --amdgpu-family ${{ env.AMDGPU_FAMILIES }} - - - name: Upload artifacts - run: | - python build_tools/github_actions/upload_build_artifacts.py \ + python3 build_tools/github_actions/post_build_upload.py \ --run-id ${{ github.run_id }} \ --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \ - --build-dir build + --build-dir build \ + --upload - - name: Add Links to Job Summary - if: always() - run: | - python build_tools/github_actions/upload_build_summary.py \ - --run-id ${{ github.run_id }} \ - --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \ - --build-dir build - - therock-test-linux: - name: "Test" + therock-test-linux-multi-node: + name: "Test multi-node" needs: [therock-build-linux] - uses: ./.github/workflows/therock-test-packages.yml + uses: ./.github/workflows/therock-test-packages-multi-node.yml with: amdgpu_families: ${{ inputs.amdgpu_families }} test_runs_on: vultr-linux-rocm artifact_run_id: ${{ github.run_id }} + + therock-test-linux-single-node: + name: "Test single-node" + needs: [therock-build-linux] + uses: ./.github/workflows/therock-test-packages-single-node.yml + with: + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: linux-mi325-1gpu-ossci-rocm + artifact_run_id: ${{ github.run_id }} diff --git 
a/projects/rccl/.github/workflows/therock-ci.yml b/projects/rccl/.github/workflows/therock-ci.yml index f36bcc224f..1d866b14ac 100644 --- a/projects/rccl/.github/workflows/therock-ci.yml +++ b/projects/rccl/.github/workflows/therock-ci.yml @@ -57,7 +57,7 @@ jobs: -DTHEROCK_USE_EXTERNAL_RCCL=ON -DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON -DTHEROCK_RCCL_SOURCE_DIR=./rccl - -DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests + -DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests -DTHEROCK_ENABLE_MPI=ON therock_ci_summary: diff --git a/projects/rccl/.github/workflows/therock-test-packages.yml b/projects/rccl/.github/workflows/therock-test-packages-multi-node.yml similarity index 93% rename from projects/rccl/.github/workflows/therock-test-packages.yml rename to projects/rccl/.github/workflows/therock-test-packages-multi-node.yml index 72ae345079..0aa2def6d8 100644 --- a/projects/rccl/.github/workflows/therock-test-packages.yml +++ b/projects/rccl/.github/workflows/therock-test-packages-multi-node.yml @@ -1,4 +1,4 @@ -name: TheRock Test Packages +name: TheRock Test Packages multi-node on: workflow_call: @@ -22,8 +22,8 @@ permissions: contents: read jobs: - test_rccl: - name: 'Test' + test_rccl_multi_node: + name: 'Test multi-node' runs-on: ${{ inputs.test_runs_on }} defaults: run: @@ -50,9 +50,8 @@ jobs: FETCH_ARTIFACT_ARGS: "--rccl" IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} - - name: Run tests on vultr-linux-node + - name: Test run: | source /home/arravikum/TheRock/.venv/bin/activate cd /home/arravikum/cvs pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html - diff --git a/projects/rccl/.github/workflows/therock-test-packages-single-node.yml b/projects/rccl/.github/workflows/therock-test-packages-single-node.yml new file mode 100644 index 0000000000..809ae8113f --- /dev/null +++ 
b/projects/rccl/.github/workflows/therock-test-packages-single-node.yml @@ -0,0 +1,69 @@ +name: TheRock Test Packages single-node + +on: + workflow_call: + inputs: + amdgpu_families: + type: string + test_runs_on: + type: string + artifact_run_id: + type: string + workflow_dispatch: + inputs: + amdgpu_families: + type: string + test_runs_on: + type: string + artifact_run_id: + type: string + +permissions: + contents: read + +jobs: + test_rccl_single_node: + name: 'Test single-node' + runs-on: ${{ inputs.test_runs_on }} + container: + image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26 + options: --ipc host + --group-add video + --device /dev/kfd + --device /dev/dri + --group-add 992 + --env-file /etc/podinfo/gha-gpu-isolation-settings + defaults: + run: + shell: bash + env: + VENV_DIR: ${{ github.workspace }}/.venv + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}" + OUTPUT_ARTIFACTS_DIR: "./build" + THEROCK_BIN_DIR: "./build/bin" + steps: + - name: Checkout Repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "ROCm/TheRock" + ref: f89dcd5c5625baecb467b9287e952c5c819073fd + + - name: Run setup test environment workflow + uses: './.github/actions/setup_test_environment' + with: + ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} + VENV_DIR: ${{ env.VENV_DIR }} + FETCH_ARTIFACT_ARGS: "--rccl --tests" + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + + - name: Test + timeout-minutes: 15 + # Currently, TheRock CI in RCCL always builds with MPI support enabled, which causes the + # RCCL correctness tests to fail on the mi325 runners which don't have MPI pre-installed. + # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests. 
+ run: | + pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \ + --log-cli-level=info \ + -k "not test_rccl_correctness_tests"