# Patch overview: unified diff over three workflow files in projects/rccl/.github/workflows.
#
# therock-ci-linux.yml
#   - adds a string input `artifact_group`
#   - gives the "Test multi-node" job `contents: read` and `id-token: write` permissions
#   - forwards `artifact_group` to therock-test-packages-multi-node.yml
# therock-ci.yml
#   - passes `artifact_group: ${{ matrix.amdgpu_family }}` (same value as `amdgpu_families`)
# therock-test-packages-multi-node.yml
#   - adds `artifact_group` to both `inputs:` blocks
#     (presumably workflow_call and workflow_dispatch; the trigger keys are not visible here - confirm)
#   - adds `id-token: write` at workflow level and again on the job
#   - moves the pinned ROCm/TheRock checkout ref from 890c8561... to 6ecc2af9...
#   - passes ARTIFACT_GROUP to the setup_test_environment action
#   - changes FETCH_ARTIFACT_ARGS from "--rccl" to "--rccl --tests"
#
# NOTE(review): the line structure of this patch has been collapsed (many patch lines per
# physical line, indentation lost). Restore the original newlines before applying it.
# NOTE(review): one hunk replaces a blank line with another blank line ("-" followed by "+");
# this looks like a whitespace-only change - confirm it is intended.
diff --git a/projects/rccl/.github/workflows/therock-ci-linux.yml b/projects/rccl/.github/workflows/therock-ci-linux.yml index de21ae3382..4274795f13 100644 --- a/projects/rccl/.github/workflows/therock-ci-linux.yml +++ b/projects/rccl/.github/workflows/therock-ci-linux.yml @@ -5,6 +5,8 @@ on: inputs: amdgpu_families: type: string + artifact_group: + type: string extra_cmake_options: type: string @@ -122,10 +124,14 @@ jobs: therock-test-linux-multi-node: name: "Test multi-node" if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }} + permissions: + contents: read + id-token: write needs: [therock-build-linux] uses: ./.github/workflows/therock-test-packages-multi-node.yml with: amdgpu_families: ${{ inputs.amdgpu_families }} + artifact_group: ${{ inputs.artifact_group }} test_runs_on: nova-linux-slurm-scale-runner artifact_run_id: ${{ github.run_id }} diff --git a/projects/rccl/.github/workflows/therock-ci.yml b/projects/rccl/.github/workflows/therock-ci.yml index 08da655d91..37f53d64ca 100644 --- a/projects/rccl/.github/workflows/therock-ci.yml +++ b/projects/rccl/.github/workflows/therock-ci.yml @@ -57,6 +57,7 @@ jobs: secrets: inherit with: amdgpu_families: ${{ matrix.amdgpu_family }} + artifact_group: ${{ matrix.amdgpu_family }} extra_cmake_options: > -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_BUILD_TESTING=ON diff --git a/projects/rccl/.github/workflows/therock-test-packages-multi-node.yml b/projects/rccl/.github/workflows/therock-test-packages-multi-node.yml index a22b15acb5..ca0a417305 100644 --- a/projects/rccl/.github/workflows/therock-test-packages-multi-node.yml +++ b/projects/rccl/.github/workflows/therock-test-packages-multi-node.yml @@ -5,6 +5,8 @@ on: inputs: amdgpu_families: type: string + artifact_group: + type: string test_runs_on: type: string artifact_run_id: @@ -13,13 +15,16 @@ on: inputs: amdgpu_families: type: string + artifact_group: + type: string test_runs_on: type: string artifact_run_id: type: string - + permissions: contents: read + id-token: write 
jobs: test_rccl_multi_node: @@ -28,6 +33,9 @@ jobs: defaults: run: shell: bash + permissions: + contents: read + id-token: write env: VENV_DIR: ${{ github.workspace }}/.venv ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}" @@ -38,16 +46,17 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: 890c856134d955441790c8ed2d60ad4fb027f4e5 + ref: 6ecc2af91fc8a4271a949005d7404bd13278c005 # 2025-10-23 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' with: ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + ARTIFACT_GROUP: ${{ inputs.artifact_group }} OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} VENV_DIR: ${{ env.VENV_DIR }} - FETCH_ARTIFACT_ARGS: "--rccl" + FETCH_ARTIFACT_ARGS: "--rccl --tests" IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} # The following step leverages slurm to run multi node rccl tests on the slurm mi350x cluster. 
@@ -63,6 +72,33 @@ jobs:
             --cluster_file ./input/cluster.json \
             --config_file ./input/mi350_config.json \
             --log-file=/tmp/rccl_log.log \
-            --html=/home/arravikum/cvs/ci_test_report.html \
+            --html=/home/arravikum/cvs/test_reports/ci_test_report.html \
             --capture=tee-sys \
             --self-contained-html"
+
+      # Runs even if earlier steps failed (always()), but is skipped for PRs from forks.
+      # NOTE(review): presumably authenticates via OIDC (needs id-token: write) - confirm.
+      - name: Configure AWS Credentials for non-forked repos
+        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
+        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
+
+      # Runs the report upload script even when the tests failed (always()).
+      # --report-path must stay the directory that --html above writes into.
+      # NOTE(review): that directory is under a personal home dir on the runner - confirm it is intended.
+      - name: Post test report upload
+        if: always()
+        working-directory: ${{ github.workspace }}
+        env:
+          # Passed via env rather than expanded into the script text, so the value is never parsed as shell code.
+          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+        run: |
+          export PYTHONPATH="${PYTHONPATH}:${{ github.workspace }}/build_tools"
+          python3 build_tools/github_actions/upload_test_report_script.py \
+            --run-id "${{ github.run_id }}" \
+            --amdgpu-family "${AMDGPU_FAMILIES}" \
+            --report-path "/home/arravikum/cvs/test_reports" \
+            --log-destination "/logs/${AMDGPU_FAMILIES}" \
+            --index-file-name "index_rccl_test_report.html"