Enable Robust Multi-Node RCCL Testing with cvs-sbatch and Improve CI Reliability (#2123)
* sbatch changes and TheRock SHA update
* Move tests location from /home to /apps/cvs_tests
* Add comments and move credentials.ini file to /apps/cvs_tests
* Changed salloc reservation to rccl reservation

---------

Co-authored-by: Aravind Ravikumar <arravikum@amd.com>
This commit is contained in:
committed by
GitHub
parent
4b295c9893
commit
239d62f545
@@ -39,8 +39,9 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
VENV_DIR: ${{ github.workspace }}/.venv
|
VENV_DIR: ${{ github.workspace }}/.venv
|
||||||
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
|
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
|
||||||
OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
|
OUTPUT_ARTIFACTS_DIR: /apps/cvs_tests/dist_new/dist/rocm
|
||||||
THEROCK_BIN_DIR: "./build/bin"
|
THEROCK_BIN_DIR: "./build/bin"
|
||||||
|
AWS_SHARED_CREDENTIALS_FILE: /apps/cvs_tests/awsconfig/credentials.ini
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout Repository
|
- name: Checkout Repository
|
||||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
@@ -61,20 +62,11 @@ jobs:
|
|||||||
|
|
||||||
# The following step leverages slurm to run multi node rccl tests on the slurm mi350x cluster.
|
# The following step leverages slurm to run multi node rccl tests on the slurm mi350x cluster.
|
||||||
# salloc will hold 4 nodes while the commands inside the block run. After the block completes, salloc automatically releases the nodes.
|
# sbatch --wait submits the job on 4 nodes (-N4) and blocks until the job finishes, so this step only completes once the tests are done. Slurm releases the nodes when the job ends.
|
||||||
|
# The sbatch script runs the rccl_heatmap_cvs script, which validates the different RCCL collectives and generates a bandwidth heatmap file for them.
|
||||||
- name: Test gfx950
|
- name: Test gfx950
|
||||||
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
|
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
|
||||||
run: |
|
run: |
|
||||||
salloc -N 4 -p meta64 -t 04:00:00 --exclusive bash -c "
|
SETUP_NODES=1 sbatch --wait -N4 /apps/cvs_tests/cvs-sbatch/sbatch/default.sbatch
|
||||||
source /home/arravikum/TheRock/.venv/bin/activate &&
|
|
||||||
cd /home/arravikum/cvs &&
|
|
||||||
python input/setup.py &&
|
|
||||||
pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \
|
|
||||||
--cluster_file ./input/cluster.json \
|
|
||||||
--config_file ./input/mi350_config.json \
|
|
||||||
--log-file=/tmp/rccl_log.log \
|
|
||||||
--html=/home/arravikum/cvs/test_reports/ci_test_report.html \
|
|
||||||
--capture=tee-sys \
|
|
||||||
--self-contained-html"
|
|
||||||
|
|
||||||
- name: Configure AWS Credentials for non-forked repos
|
- name: Configure AWS Credentials for non-forked repos
|
||||||
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
|
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
|
||||||
@@ -91,6 +83,6 @@ jobs:
|
|||||||
python3 build_tools/github_actions/upload_test_report_script.py \
|
python3 build_tools/github_actions/upload_test_report_script.py \
|
||||||
--run-id "${{ github.run_id }}" \
|
--run-id "${{ github.run_id }}" \
|
||||||
--amdgpu-family "${{ inputs.amdgpu_families }}" \
|
--amdgpu-family "${{ inputs.amdgpu_families }}" \
|
||||||
--report-path "/home/arravikum/cvs/test_reports" \
|
--report-path "/apps/cvs_tests/test_reports" \
|
||||||
--log-destination "/logs/gfx950-dcgpu" \
|
--log-destination "/logs/gfx950-dcgpu" \
|
||||||
--index-file-name "index_rccl_test_report.html"
|
--index-file-name "index_rccl_test_report.html"
|
||||||
|
|||||||
Reference in New Issue
Block a user