From c8da880dc7e2ff399031a43099989c2e6b8cf57a Mon Sep 17 00:00:00 2001 From: "Kapil S. Pawar" Date: Tue, 11 Nov 2025 10:11:19 -0600 Subject: [PATCH] Added Functional Tests for CSV Tuner Plugin (#1968) * Add functional tests for CSV Tuner Plugin * Updated directory structure * Updated and renamed directories * Updated csv conf files * Updated readme * Updated readme * Updated readme --- test/ext-plugins/.gitignore | 20 + test/ext-plugins/README.md | 159 +++++++ .../csv_confs/incorrect_values_config.conf | 61 +++ .../assets/csv_confs/multinode_config.conf | 104 +++++ .../assets/csv_confs/no_matching_config.conf | 44 ++ .../assets/csv_confs/singlenode_config.conf | 39 ++ .../unsupported_algo_proto_config.conf | 40 ++ .../valid_config_with_wildcards.conf | 39 ++ .../valid_config_without_wildcards.conf | 40 ++ test/ext-plugins/pytest.ini | 12 + test/ext-plugins/requirements.txt | 2 + test/ext-plugins/tests/conftest.py | 109 +++++ .../tests/ext-tuner/test_allgather.py | 440 ++++++++++++++++++ .../tests/ext-tuner/test_allreduce.py | 431 +++++++++++++++++ .../tests/ext-tuner/test_broadcast.py | 428 +++++++++++++++++ .../tests/ext-tuner/test_reduce.py | 428 +++++++++++++++++ .../tests/ext-tuner/test_reducescatter.py | 428 +++++++++++++++++ 17 files changed, 2824 insertions(+) create mode 100644 test/ext-plugins/.gitignore create mode 100644 test/ext-plugins/README.md create mode 100644 test/ext-plugins/assets/csv_confs/incorrect_values_config.conf create mode 100644 test/ext-plugins/assets/csv_confs/multinode_config.conf create mode 100644 test/ext-plugins/assets/csv_confs/no_matching_config.conf create mode 100644 test/ext-plugins/assets/csv_confs/singlenode_config.conf create mode 100644 test/ext-plugins/assets/csv_confs/unsupported_algo_proto_config.conf create mode 100644 test/ext-plugins/assets/csv_confs/valid_config_with_wildcards.conf create mode 100644 test/ext-plugins/assets/csv_confs/valid_config_without_wildcards.conf create mode 100644 
test/ext-plugins/pytest.ini create mode 100644 test/ext-plugins/requirements.txt create mode 100644 test/ext-plugins/tests/conftest.py create mode 100644 test/ext-plugins/tests/ext-tuner/test_allgather.py create mode 100644 test/ext-plugins/tests/ext-tuner/test_allreduce.py create mode 100644 test/ext-plugins/tests/ext-tuner/test_broadcast.py create mode 100644 test/ext-plugins/tests/ext-tuner/test_reduce.py create mode 100644 test/ext-plugins/tests/ext-tuner/test_reducescatter.py diff --git a/test/ext-plugins/.gitignore b/test/ext-plugins/.gitignore new file mode 100644 index 0000000000..2a2778dbca --- /dev/null +++ b/test/ext-plugins/.gitignore @@ -0,0 +1,20 @@ +# Ignore Python cache and virtual environment folders +__pycache__/ +*.pyc +*.pyo +*.pyd + +# Ignore pytest cache +.pytest_cache/ +.cache/ + +# Ignore log folders +logs/ +log/ +*.log + +# Ignore virtual environment folders +venv/ + +# Ignore build artifacts +build/ diff --git a/test/ext-plugins/README.md b/test/ext-plugins/README.md new file mode 100644 index 0000000000..01b2a68b1e --- /dev/null +++ b/test/ext-plugins/README.md @@ -0,0 +1,159 @@ +# RCCL CSV Tuner Plugin Tests + +## Description + +This directory contains automated tests for the RCCL (ROCm Communication Collectives Library) CSV Tuner Plugin. The test suite validates the functionality of the CSV-based tuning plugin across different collective operations (AllReduce, Broadcast, Reduce, AllGather, and ReduceScatter) and various configuration scenarios. + +The tests are written in Python using the pytest framework, making it easy to run, maintain, and extend the test coverage. 
+ +## Directory Structure + +``` +ext-plugins/ +├── README.md # This file - documentation for the test suite +├── requirements.txt # Python dependencies required for testing +├── pytest.ini # Pytest configuration and test markers +├── .gitignore # Git ignore rules for Python/pytest artifacts +├── venv/ # Python virtual environment (created after setup) +├── logs/ # Test execution logs and output files +├── assets/ # Test configuration files and assets +│ └── csv_confs/ # CSV configuration files for testing +│ ├── incorrect_values_config.conf +│ ├── multinode_config.conf +│ ├── no_matching_config.conf +│ ├── singlenode_config.conf +│ ├── unsupported_algo_proto_config.conf +│ ├── valid_config_with_wildcards.conf +│ └── valid_config_without_wildcards.conf +└── tests/ # Test suite directory + ├── conftest.py # Pytest fixtures and shared test configuration + └── ext-tuner/ # CSV Tuner Plugin specific tests + ├── test_allreduce.py + ├── test_broadcast.py + ├── test_reduce.py + ├── test_allgather.py + └── test_reducescatter.py +``` + +## Installation & Setup + +### Prerequisites + +- Python 3.7 or higher +- RCCL library installed +- ROCm environment configured +- **Important**: Update the installation paths in `tests/conftest.py` to match your environment: + +```python +RCCL_INSTALL_DIR = "path/to/rccl" +OMPI_INSTALL_DIR = "path/to/ompi/install" +RCCL_TESTS_DIR = "path/to/rccl-tests" +``` + +Replace these placeholder paths with your actual installation directories before running the tests. + +### Building the RCCL CSV Tuner Plugin + +Before running the tests, you need to build the RCCL CSV tuner plugin library. The plugin is located in the `ext-tuner/example` directory. + +#### Step 1: Navigate to the plugin directory + +```bash +cd rccl/ext-tuner/example +``` + +#### Step 2: Build the plugin + +```bash +make +``` + +This will compile the plugin and create `libnccl-tuner-example.so` in the same directory. 
+ +### Step 1: Create Virtual Environment + +Create a Python virtual environment to isolate the test dependencies: + +```bash +python3 -m venv venv +``` + +### Step 2: Activate Virtual Environment + +Activate the virtual environment using the appropriate command for your shell: + +**On Linux:** +```bash +source venv/bin/activate +``` + +Once activated, you should see `(venv)` at the beginning of your command prompt. + +### Step 3: Install Dependencies + +Install the required Python packages: + +```bash +pip install -r requirements.txt +``` + +## Running Tests + +### Run All Tests + +To run the entire test suite: + +```bash +pytest --cache-clear +``` + +### Run Tests with Verbose Output + +For more detailed test output: + +```bash +pytest -v --cache-clear +``` + +### Run Tests by Marker + +Tests are organized using pytest markers. You can run specific groups of tests: + +**Run CSV Plugin tests:** +```bash +pytest -m ext_tuner --cache-clear +``` + +**Run tests for specific collective operations:** +```bash +pytest -m allreduce --cache-clear # AllReduce tests +pytest -m broadcast --cache-clear # Broadcast tests +pytest -m reduce --cache-clear # Reduce tests +pytest -m allgather --cache-clear # AllGather tests +pytest -m reducescatter --cache-clear # ReduceScatter tests +``` + +### Run Tests with Log Output + +To see live log output during test execution: + +```bash +pytest -v -s --cache-clear +``` + +### Generate Test Report + +To print per-test results with shortened tracebacks on the console (no report file is written): + +```bash +pytest --verbose --tb=short +``` + +## Additional Notes + +- **Deactivating Virtual Environment**: When you're done testing, deactivate the virtual environment by running: + ```bash + deactivate + ``` + +- **Log Files**: Test execution logs are stored in the `logs/` directory for later review and debugging. 
diff --git a/test/ext-plugins/assets/csv_confs/incorrect_values_config.conf b/test/ext-plugins/assets/csv_confs/incorrect_values_config.conf new file mode 100644 index 0000000000..27dd9231bd --- /dev/null +++ b/test/ext-plugins/assets/csv_confs/incorrect_values_config.conf @@ -0,0 +1,61 @@ +# Test configuration with invalid/incorrect values to test plugin robustness - 8B-128M +# Invalid collective type (should default to allreduce) +wrongtype,8,65536,ring,simple,-1,-1,-1,-1,-1 +# Invalid algorithm (should default to ring) +allreduce,65537,16777216,badring,simple,-1,-1,-1,-1,-1 +# Invalid protocol (should default to simple) +allreduce,16777217,67108864,tree,badproto,-1,-1,-1,-1,-1 +# Invalid numbers (letters instead of numbers) +allreduce,abc,def,ring,simple,xyz,-1,-1,-1,-1 +# Wrong field count (should be ignored completely) +allreduce,8,128M,ring +# Mixed valid and invalid +allreduce,67108865,134217728,ring,simple,2,-1,-1,-1,-1 + +# Broadcast configurations with invalid/incorrect values - 8B-128M +# Invalid algorithm (should default to ring for broadcast) +broadcast,8,65536,badring,simple,-1,-1,-1,-1,-1 +# Invalid protocol (should default to simple) +broadcast,65537,16777216,ring,badproto,-1,-1,-1,-1,-1 +# Invalid numbers (letters instead of numbers) +broadcast,abc,def,ring,simple,xyz,-1,-1,-1,-1 +# Wrong field count (should be ignored completely) +broadcast,8,128M,ring +# Mixed valid and invalid +broadcast,16777217,134217728,ring,simple,4,-1,-1,-1,-1 + +# AllGather configurations with invalid/incorrect values - 8B-128M +# Invalid algorithm (should default to ring for allgather) +allgather,8,65536,badring,simple,-1,-1,-1,-1,-1 +# Invalid protocol (should default to simple) +allgather,65537,16777216,ring,badproto,-1,-1,-1,-1,-1 +# Invalid numbers (letters instead of numbers) +allgather,abc,def,ring,simple,xyz,-1,-1,-1,-1 +# Wrong field count (should be ignored completely) +allgather,8,128M,ring +# Mixed valid and invalid 
+allgather,16777217,134217728,ring,simple,4,-1,-1,-1,-1 + +# Reduce configurations with invalid/incorrect values - 8B-128M +# Invalid algorithm (should default to ring for reduce) +reduce,8,65536,badring,simple,-1,-1,-1,-1,-1 +# Invalid protocol (should default to simple) +reduce,65537,16777216,ring,badproto,-1,-1,-1,-1,-1 +# Invalid numbers (letters instead of numbers) +reduce,abc,def,ring,simple,xyz,-1,-1,-1,-1 +# Wrong field count (should be ignored completely) +reduce,8,128M,ring +# Mixed valid and invalid +reduce,16777217,134217728,ring,simple,4,-1,-1,-1,-1 + +# ReduceScatter configurations with invalid/incorrect values - 8B-128M +# Invalid algorithm (should default to ring for reducescatter) +reducescatter,8,65536,badring,simple,-1,-1,-1,-1,-1 +# Invalid protocol (should default to simple) +reducescatter,65537,16777216,ring,badproto,-1,-1,-1,-1,-1 +# Invalid numbers (letters instead of numbers) +reducescatter,abc,def,ring,simple,xyz,-1,-1,-1,-1 +# Wrong field count (should be ignored completely) +reducescatter,8,128M,ring +# Mixed valid and invalid +reducescatter,16777217,134217728,ring,simple,4,-1,-1,-1,-1 diff --git a/test/ext-plugins/assets/csv_confs/multinode_config.conf b/test/ext-plugins/assets/csv_confs/multinode_config.conf new file mode 100644 index 0000000000..e18b6fcd2d --- /dev/null +++ b/test/ext-plugins/assets/csv_confs/multinode_config.conf @@ -0,0 +1,104 @@ +# AllReduce configurations for multi-node setups - 8B-128M +# 2 nodes, 16 ranks total - 8B-128M +allreduce,8,65536,tree,ll,4,2,16,-1,-1 # Small: tree (8B to 64K) +allreduce,65537,16777216,ring,ll128,6,2,16,-1,-1 # Medium: ring (64K to 16M) +allreduce,16777217,134217728,ring,simple,8,2,16,-1,-1 # Large: ring/simple (16M to 128M) + +# 3 nodes, 24 ranks total - 8B-128M +allreduce,8,65536,tree,ll,4,2,24,-1,-1 # Small: tree (8B to 64K) +allreduce,65537,16777216,ring,ll128,6,2,24,-1,-1 # Medium: ring (64K to 16M) +allreduce,16777217,134217728,ring,simple,8,2,24,-1,-1 # Large: ring/simple (16M to 
128M) + +# 4 nodes, 32 ranks total - 8B-128M +allreduce,8,65536,tree,ll,4,2,32,-1,-1 # Small: tree (8B to 64K) +allreduce,65537,16777216,ring,ll128,6,2,32,-1,-1 # Medium: ring (64K to 16M) +allreduce,16777217,134217728,ring,simple,8,2,32,-1,-1 # Large: ring/simple (16M to 128M) + +# 5 nodes, 40 ranks total - 8B-128M +allreduce,8,65536,tree,ll,4,2,40,-1,-1 # Small: tree (8B to 64K) +allreduce,65537,16777216,ring,ll128,6,2,40,-1,-1 # Medium: ring (64K to 16M) +allreduce,16777217,134217728,ring,simple,8,2,40,-1,-1 # Large: ring/simple (16M to 128M) + +# Broadcast configurations for multi-node setups - 8B-128M +# 2 nodes, 16 ranks total +broadcast,8,65536,ring,ll,4,2,16,-1,-1 # Small: ring (8B to 64K) +broadcast,65537,16777216,ring,ll128,6,2,16,-1,-1 # Medium: ring (64K to 16M) +broadcast,16777217,134217728,ring,simple,8,2,16,-1,-1 # Large: ring/simple (16M to 128M) + +# 3 nodes, 24 ranks total +broadcast,8,65536,ring,ll,4,2,24,-1,-1 # Small: ring (8B to 64K) +broadcast,65537,16777216,ring,ll128,6,2,24,-1,-1 # Medium: ring (64K to 16M) +broadcast,16777217,134217728,ring,simple,8,2,24,-1,-1 # Large: ring/simple (16M to 128M) + +# 4 nodes, 32 ranks total +broadcast,8,65536,ring,ll,4,2,32,-1,-1 # Small: ring (8B to 64K) +broadcast,65537,16777216,ring,ll128,6,2,32,-1,-1 # Medium: ring (64K to 16M) +broadcast,16777217,134217728,ring,simple,8,2,32,-1,-1 # Large: ring/simple (16M to 128M) + +# 5 nodes, 40 ranks total +broadcast,8,65536,ring,ll,4,2,40,-1,-1 # Small: ring (8B to 64K) +broadcast,65537,16777216,ring,ll128,6,2,40,-1,-1 # Medium: ring (64K to 16M) +broadcast,16777217,134217728,ring,simple,8,2,40,-1,-1 # Large: ring/simple (16M to 128M) + +# AllGather configurations for multi-node setups - 8B-128M +# 2 nodes, 16 ranks total +allgather,8,65536,ring,ll,4,2,16,-1,-1 # Small: ring (8B to 64K) +allgather,65537,16777216,ring,ll128,6,2,16,-1,-1 # Medium: ring (64K to 16M) +allgather,16777217,134217728,ring,simple,8,2,16,-1,-1 # Large: ring/simple (16M to 128M) + +# 3 
nodes, 24 ranks total +allgather,8,65536,ring,ll,4,2,24,-1,-1 # Small: ring (8B to 64K) +allgather,65537,16777216,ring,ll128,6,2,24,-1,-1 # Medium: ring (64K to 16M) +allgather,16777217,134217728,ring,simple,8,2,24,-1,-1 # Large: ring/simple (16M to 128M) + +# 4 nodes, 32 ranks total +allgather,8,65536,ring,ll,4,2,32,-1,-1 # Small: ring (8B to 64K) +allgather,65537,16777216,ring,ll128,6,2,32,-1,-1 # Medium: ring (64K to 16M) +allgather,16777217,134217728,ring,simple,8,2,32,-1,-1 # Large: ring/simple (16M to 128M) + +# 5 nodes, 40 ranks total +allgather,8,65536,ring,ll,4,2,40,-1,-1 # Small: ring (8B to 64K) +allgather,65537,16777216,ring,ll128,6,2,40,-1,-1 # Medium: ring (64K to 16M) +allgather,16777217,134217728,ring,simple,8,2,40,-1,-1 # Large: ring/simple (16M to 128M) + +# Reduce configurations for multi-node setups - 8B-128M +# 2 nodes, 16 ranks total +reduce,8,65536,ring,ll,4,2,16,-1,-1 # Small: ring (8B to 64K) +reduce,65537,16777216,ring,ll128,6,2,16,-1,-1 # Medium: ring (64K to 16M) +reduce,16777217,134217728,ring,simple,8,2,16,-1,-1 # Large: ring/simple (16M to 128M) + +# 3 nodes, 24 ranks total +reduce,8,65536,ring,ll,4,2,24,-1,-1 # Small: ring (8B to 64K) +reduce,65537,16777216,ring,ll128,6,2,24,-1,-1 # Medium: ring (64K to 16M) +reduce,16777217,134217728,ring,simple,8,2,24,-1,-1 # Large: ring/simple (16M to 128M) + +# 4 nodes, 32 ranks total +reduce,8,65536,ring,ll,4,2,32,-1,-1 # Small: ring (8B to 64K) +reduce,65537,16777216,ring,ll128,6,2,32,-1,-1 # Medium: ring (64K to 16M) +reduce,16777217,134217728,ring,simple,8,2,32,-1,-1 # Large: ring/simple (16M to 128M) + +# 5 nodes, 40 ranks total +reduce,8,65536,ring,ll,4,2,40,-1,-1 # Small: ring (8B to 64K) +reduce,65537,16777216,ring,ll128,6,2,40,-1,-1 # Medium: ring (64K to 16M) +reduce,16777217,134217728,ring,simple,8,2,40,-1,-1 # Large: ring/simple (16M to 128M) + +# ReduceScatter configurations for multi-node setups - 8B-128M +# 2 nodes, 16 ranks total +reducescatter,8,65536,ring,ll,4,2,16,-1,-1 # 
Small: ring (8B to 64K) +reducescatter,65537,16777216,ring,ll128,6,2,16,-1,-1 # Medium: ring (64K to 16M) +reducescatter,16777217,134217728,ring,simple,8,2,16,-1,-1 # Large: ring/simple (16M to 128M) + +# 3 nodes, 24 ranks total +reducescatter,8,65536,ring,ll,4,2,24,-1,-1 # Small: ring (8B to 64K) +reducescatter,65537,16777216,ring,ll128,6,2,24,-1,-1 # Medium: ring (64K to 16M) +reducescatter,16777217,134217728,ring,simple,8,2,24,-1,-1 # Large: ring/simple (16M to 128M) + +# 4 nodes, 32 ranks total +reducescatter,8,65536,ring,ll,4,2,32,-1,-1 # Small: ring (8B to 64K) +reducescatter,65537,16777216,ring,ll128,6,2,32,-1,-1 # Medium: ring (64K to 16M) +reducescatter,16777217,134217728,ring,simple,8,2,32,-1,-1 # Large: ring/simple (16M to 128M) + +# 5 nodes, 40 ranks total +reducescatter,8,65536,ring,ll,4,2,40,-1,-1 # Small: ring (8B to 64K) +reducescatter,65537,16777216,ring,ll128,6,2,40,-1,-1 # Medium: ring (64K to 16M) +reducescatter,16777217,134217728,ring,simple,8,2,40,-1,-1 # Large: ring/simple (16M to 128M) diff --git a/test/ext-plugins/assets/csv_confs/no_matching_config.conf b/test/ext-plugins/assets/csv_confs/no_matching_config.conf new file mode 100644 index 0000000000..ea857ef41f --- /dev/null +++ b/test/ext-plugins/assets/csv_confs/no_matching_config.conf @@ -0,0 +1,44 @@ +# Invalid configuration that should NOT match the test environment +# Wrong collective type (broadcast instead of allreduce for allreduce tests) - 8B-128M +broadcast,0,7,ring,simple,-1,-1,-1,-1,-1 # Size too small (broadcast test starts at 8B) +broadcast,134217729,268435456,ring,simple,-1,-1,-1,-1,-1 # Size too large (broadcast test ends at 128M) +# Wrong collective type (allreduce instead of broadcast for broadcast tests) - 8B-128M +allreduce,0,7,tree,simple,-1,-1,-1,-1,-1 # Size too small (doesn't include 8B+) +allreduce,134217729,268435456,tree,simple,-1,-1,-1,-1,-1 # Size too large (doesn't include 8B-128M) +# Wrong collective type (allgather instead of other tests) - 8B-128M 
+allgather,0,7,ring,simple,-1,-1,-1,-1,-1 # Size too small (allgather test starts at 8B) +allgather,134217729,268435456,ring,simple,-1,-1,-1,-1,-1 # Size too large (allgather test ends at 128M) +# Wrong collective type (reduce instead of other tests) - 8B-128M +reduce,0,7,ring,simple,-1,-1,-1,-1,-1 # Size too small (reduce test starts at 8B) +reduce,134217729,268435456,ring,simple,-1,-1,-1,-1,-1 # Size too large (reduce test ends at 128M) +# Wrong collective type (reducescatter instead of other tests) - 8B-128M +reducescatter,0,7,ring,simple,-1,-1,-1,-1,-1 # Size too small (reducescatter test starts at 8B) +reducescatter,134217729,268435456,ring,simple,-1,-1,-1,-1,-1 # Size too large (reducescatter test ends at 128M) + +# Wrong size ranges (tests use 8 bytes to 128M, these are way outside) +allreduce,2000000000,4000000000,ring,simple,-1,-1,-1,-1,-1 +allreduce,5000000000,6000000000,tree,ll128,-1,-1,-1,-1,-1 +broadcast,2000000000,4000000000,ring,simple,-1,-1,-1,-1,-1 +broadcast,5000000000,6000000000,ring,ll128,-1,-1,-1,-1,-1 +allgather,2000000000,4000000000,ring,simple,-1,-1,-1,-1,-1 +allgather,5000000000,6000000000,ring,ll128,-1,-1,-1,-1,-1 +reduce,2000000000,4000000000,ring,simple,-1,-1,-1,-1,-1 +reduce,5000000000,6000000000,ring,ll128,-1,-1,-1,-1,-1 +reducescatter,2000000000,4000000000,ring,simple,-1,-1,-1,-1,-1 +reducescatter,5000000000,6000000000,ring,ll128,-1,-1,-1,-1,-1 + +# Wrong node/rank counts (broadcast tests use 1 node, 4/8 ranks - these specify different, size outside 8B-128M) +broadcast,134217729,268435456,ring,simple,-1,4,16,-1,-1 # 4 nodes, 16 ranks, size too large +broadcast,268435457,536870912,ring,simple,-1,2,4,-1,-1 # 2 nodes, 4 ranks, size too large +# Wrong node/rank counts (allreduce tests use 1 node, 4/8 ranks - these specify different, size outside 8B-128M) +allreduce,134217729,268435456,ring,simple,-1,8,16,-1,-1 # 8 nodes, 16 ranks, size too large +allreduce,268435457,536870912,tree,simple,-1,2,8,-1,-1 # 2 nodes, 8 ranks, size too large +# 
Wrong node/rank counts (allgather tests use 1 node, 8 ranks - these specify different, size outside 8B-128M) +allgather,134217729,268435456,ring,simple,-1,4,16,-1,-1 # 4 nodes, 16 ranks, size too large +allgather,268435457,536870912,ring,simple,-1,2,4,-1,-1 # 2 nodes, 4 ranks, size too large +# Wrong node/rank counts (reduce tests use 1 node, 8 ranks - these specify different, size outside 8B-128M) +reduce,134217729,268435456,ring,simple,-1,4,16,-1,-1 # 4 nodes, 16 ranks, size too large +reduce,268435457,536870912,ring,simple,-1,2,4,-1,-1 # 2 nodes, 4 ranks, size too large +# Wrong node/rank counts (reducescatter tests use 1 node, 8 ranks - these specify different, size outside 8B-128M) +reducescatter,134217729,268435456,ring,simple,-1,4,16,-1,-1 # 4 nodes, 16 ranks, size too large +reducescatter,268435457,536870912,ring,simple,-1,2,4,-1,-1 # 2 nodes, 4 ranks, size too large diff --git a/test/ext-plugins/assets/csv_confs/singlenode_config.conf b/test/ext-plugins/assets/csv_confs/singlenode_config.conf new file mode 100644 index 0000000000..88453a306a --- /dev/null +++ b/test/ext-plugins/assets/csv_confs/singlenode_config.conf @@ -0,0 +1,39 @@ +# Single-node allreduce configuration for 8 ranks, 1 node - use working algorithm/protocol combinations - 8B-128M +# Small allreduce: tree/ll (8 bytes to 64K) +allreduce,8,65536,tree,ll,2,1,8,-1,-1 +# Medium allreduce: ring/ll128 (64K to 16M) +allreduce,65537,16777216,ring,ll128,4,1,8,-1,-1 +# Large allreduce: ring/simple (16M to 128M) +allreduce,16777217,134217728,ring,simple,6,1,8,-1,-1 + +# Single-node broadcast configuration for 8 ranks, 1 node - use ring algorithm (supported for broadcast) - 8B-128M +# Small broadcast: ring/ll (8 bytes to 64K) +broadcast,8,65536,ring,ll,2,1,8,-1,-1 +# Medium broadcast: ring/ll128 (64K to 16M) +broadcast,65537,16777216,ring,ll128,4,1,8,-1,-1 +# Large broadcast: ring/simple (16M to 128M) +broadcast,16777217,134217728,ring,simple,6,1,8,-1,-1 + +# Single-node allgather configuration for 8 
ranks, 1 node - use ring algorithm (supported for allgather) - 8B-128M +# Small allgather: ring/ll (8 bytes to 64K) +allgather,8,65536,ring,ll,2,1,8,-1,-1 +# Medium allgather: ring/ll128 (64K to 16M) +allgather,65537,16777216,ring,ll128,4,1,8,-1,-1 +# Large allgather: ring/simple (16M to 128M) +allgather,16777217,134217728,ring,simple,6,1,8,-1,-1 + +# Single-node reduce configuration for 8 ranks, 1 node - use ring algorithm (supported for reduce) - 8B-128M +# Small reduce: ring/ll (8 bytes to 64K) +reduce,8,65536,ring,ll,2,1,8,-1,-1 +# Medium reduce: ring/ll128 (64K to 16M) +reduce,65537,16777216,ring,ll128,4,1,8,-1,-1 +# Large reduce: ring/simple (16M to 128M) +reduce,16777217,134217728,ring,simple,6,1,8,-1,-1 + +# Single-node reducescatter configuration for 8 ranks, 1 node - use ring algorithm (supported for reducescatter) - 8B-128M +# Small reducescatter: ring/ll (8 bytes to 64K) +reducescatter,8,65536,ring,ll,2,1,8,-1,-1 +# Medium reducescatter: ring/ll128 (64K to 16M) +reducescatter,65537,16777216,ring,ll128,4,1,8,-1,-1 +# Large reducescatter: ring/simple (16M to 128M) +reducescatter,16777217,134217728,ring,simple,6,1,8,-1,-1 \ No newline at end of file diff --git a/test/ext-plugins/assets/csv_confs/unsupported_algo_proto_config.conf b/test/ext-plugins/assets/csv_confs/unsupported_algo_proto_config.conf new file mode 100644 index 0000000000..b1a0dd1213 --- /dev/null +++ b/test/ext-plugins/assets/csv_confs/unsupported_algo_proto_config.conf @@ -0,0 +1,40 @@ +# AllReduce configurations with unsupported algorithm/protocol combinations - 8B-128M +# These have valid size ranges and collective types but unsupported algorithms +allreduce,8,65536,collnet_direct,simple,2,-1,-1,-1,-1 +allreduce,65537,8388608,collnet_chain,ll,4,-1,-1,-1,-1 +allreduce,8388609,33554432,nvls,simple,-1,-1,-1,-1,-1 +allreduce,33554433,67108864,nvls_tree,ll128,-1,-1,-1,-1,-1 +allreduce,67108865,100663296,pat,simple,-1,-1,-1,-1,-1 
+allreduce,100663297,134217728,collnet_direct,ll128,-1,-1,-1,-1,-1 + +# Broadcast configurations with unsupported algorithm/protocol combinations - 8B-128M +broadcast,8,65536,collnet_direct,simple,2,-1,-1,-1,-1 +broadcast,65537,8388608,collnet_chain,ll,4,-1,-1,-1,-1 +broadcast,8388609,33554432,nvls,simple,-1,-1,-1,-1,-1 +broadcast,33554433,67108864,nvls_tree,ll128,-1,-1,-1,-1,-1 +broadcast,67108865,100663296,pat,simple,-1,-1,-1,-1,-1 +broadcast,100663297,134217728,collnet_direct,ll128,-1,-1,-1,-1,-1 + +# AllGather configurations with unsupported algorithm/protocol combinations - 8B-128M +allgather,8,65536,collnet_direct,simple,2,-1,-1,-1,-1 +allgather,65537,8388608,collnet_chain,ll,4,-1,-1,-1,-1 +allgather,8388609,33554432,nvls,simple,-1,-1,-1,-1,-1 +allgather,33554433,67108864,nvls_tree,ll128,-1,-1,-1,-1,-1 +allgather,67108865,100663296,pat,simple,-1,-1,-1,-1,-1 +allgather,100663297,134217728,collnet_direct,ll128,-1,-1,-1,-1,-1 + +# Reduce configurations with unsupported algorithm/protocol combinations - 8B-128M +reduce,8,65536,collnet_direct,simple,2,-1,-1,-1,-1 +reduce,65537,8388608,collnet_chain,ll,4,-1,-1,-1,-1 +reduce,8388609,33554432,nvls,simple,-1,-1,-1,-1,-1 +reduce,33554433,67108864,nvls_tree,ll128,-1,-1,-1,-1,-1 +reduce,67108865,100663296,pat,simple,-1,-1,-1,-1,-1 +reduce,100663297,134217728,collnet_direct,ll128,-1,-1,-1,-1,-1 + +# ReduceScatter configurations with unsupported algorithm/protocol combinations - 8B-128M +reducescatter,8,65536,collnet_direct,simple,2,-1,-1,-1,-1 +reducescatter,65537,8388608,collnet_chain,ll,4,-1,-1,-1,-1 +reducescatter,8388609,33554432,nvls,simple,-1,-1,-1,-1,-1 +reducescatter,33554433,67108864,nvls_tree,ll128,-1,-1,-1,-1,-1 +reducescatter,67108865,100663296,pat,simple,-1,-1,-1,-1,-1 +reducescatter,100663297,134217728,collnet_direct,ll128,-1,-1,-1,-1,-1 diff --git a/test/ext-plugins/assets/csv_confs/valid_config_with_wildcards.conf b/test/ext-plugins/assets/csv_confs/valid_config_with_wildcards.conf new file mode 100644 
index 0000000000..7c89b667be --- /dev/null +++ b/test/ext-plugins/assets/csv_confs/valid_config_with_wildcards.conf @@ -0,0 +1,39 @@ +# AllReduce configurations with supported algo/proto combinations - 8B-128M +# Small allreduce: tree/ll (8 bytes to 64K) +allreduce,8,65536,tree,ll,2,-1,-1,-1,-1 +# Medium allreduce: ring/ll128 (64K to 16M) +allreduce,65537,16777216,ring,ll128,4,-1,-1,-1,-1 +# Large allreduce: ring/simple (16M to 128M) +allreduce,16777217,134217728,ring,simple,-1,-1,-1,-1,-1 + +# Broadcast configurations with supported algo/proto combinations - 8B-128M +# Small broadcast: ring/ll (8 bytes to 64K) +broadcast,8,65536,ring,ll,2,-1,-1,-1,-1 +# Medium broadcast: ring/ll128 (64K to 16M) +broadcast,65537,16777216,ring,ll128,4,-1,-1,-1,-1 +# Large broadcast: ring/simple (16M to 128M) +broadcast,16777217,134217728,ring,simple,-1,-1,-1,-1,-1 + +# AllGather configurations with supported algo/proto combinations - 8B-128M +# Small allgather: ring/ll (8 bytes to 64K) +allgather,8,65536,ring,ll,2,-1,-1,-1,-1 +# Medium allgather: ring/ll128 (64K to 16M) +allgather,65537,16777216,ring,ll128,4,-1,-1,-1,-1 +# Large allgather: ring/simple (16M to 128M) +allgather,16777217,134217728,ring,simple,-1,-1,-1,-1,-1 + +# Reduce configurations with supported algo/proto combinations - 8B-128M +# Small reduce: ring/ll (8 bytes to 64K) +reduce,8,65536,ring,ll,2,-1,-1,-1,-1 +# Medium reduce: ring/ll128 (64K to 16M) +reduce,65537,16777216,ring,ll128,4,-1,-1,-1,-1 +# Large reduce: ring/simple (16M to 128M) +reduce,16777217,134217728,ring,simple,-1,-1,-1,-1,-1 + +# ReduceScatter configurations with supported algo/proto combinations - 8B-128M +# Small reducescatter: ring/ll (8 bytes to 64K) +reducescatter,8,65536,ring,ll,2,-1,-1,-1,-1 +# Medium reducescatter: ring/ll128 (64K to 16M) +reducescatter,65537,16777216,ring,ll128,4,-1,-1,-1,-1 +# Large reducescatter: ring/simple (16M to 128M) +reducescatter,16777217,134217728,ring,simple,-1,-1,-1,-1,-1 diff --git 
a/test/ext-plugins/assets/csv_confs/valid_config_without_wildcards.conf b/test/ext-plugins/assets/csv_confs/valid_config_without_wildcards.conf new file mode 100644 index 0000000000..16ef738d38 --- /dev/null +++ b/test/ext-plugins/assets/csv_confs/valid_config_without_wildcards.conf @@ -0,0 +1,40 @@ +# AllReduce configurations with specific nodes/ranks values (no wildcards for topology) - 8B-128M +# But use wildcards for pipeOps/regBuff since they vary at runtime +# Small allreduce: tree/ll, 2 channels, 1 node, 4 ranks, any pipeOps, any regBuff (8 bytes to 64K) +allreduce,8,65536,tree,ll,2,1,4,-1,-1 +# Medium allreduce: ring/ll128, 4 channels, 1 node, 4 ranks, any pipeOps, any regBuff (64K to 16M) +allreduce,65537,16777216,ring,ll128,4,1,4,-1,-1 +# Large allreduce: ring/simple, 6 channels, 1 node, 4 ranks, any pipeOps, any regBuff (16M to 128M) +allreduce,16777217,134217728,ring,simple,6,1,4,-1,-1 + +# Broadcast configurations with specific nodes/ranks values (no wildcards for topology) - 8B-128M +# Small broadcast: ring/ll, 2 channels, 1 node, 4 ranks, any pipeOps, any regBuff (8 bytes to 64K) +broadcast,8,65536,ring,ll,2,1,4,-1,-1 +# Medium broadcast: ring/ll128, 4 channels, 1 node, 4 ranks, any pipeOps, any regBuff (64K to 16M) +broadcast,65537,16777216,ring,ll128,4,1,4,-1,-1 +# Large broadcast: ring/simple, 6 channels, 1 node, 4 ranks, any pipeOps, any regBuff (16M to 128M) +broadcast,16777217,134217728,ring,simple,6,1,4,-1,-1 + +# AllGather configurations with specific nodes/ranks values (no wildcards for topology) - 8B-128M +# Small allgather: ring/ll, 2 channels, 1 node, 4 ranks, any pipeOps, any regBuff (8 bytes to 64K) +allgather,8,65536,ring,ll,2,1,4,-1,-1 +# Medium allgather: ring/ll128, 4 channels, 1 node, 4 ranks, any pipeOps, any regBuff (64K to 16M) +allgather,65537,16777216,ring,ll128,4,1,4,-1,-1 +# Large allgather: ring/simple, 6 channels, 1 node, 4 ranks, any pipeOps, any regBuff (16M to 128M) +allgather,16777217,134217728,ring,simple,6,1,4,-1,-1 
+ +# Reduce configurations with specific nodes/ranks values (no wildcards for topology) - 8B-128M +# Small reduce: ring/ll, 2 channels, 1 node, 4 ranks, any pipeOps, any regBuff (8 bytes to 64K) +reduce,8,65536,ring,ll,2,1,4,-1,-1 +# Medium reduce: ring/ll128, 4 channels, 1 node, 4 ranks, any pipeOps, any regBuff (64K to 16M) +reduce,65537,16777216,ring,ll128,4,1,4,-1,-1 +# Large reduce: ring/simple, 6 channels, 1 node, 4 ranks, any pipeOps, any regBuff (16M to 128M) +reduce,16777217,134217728,ring,simple,6,1,4,-1,-1 + +# ReduceScatter configurations with specific nodes/ranks values (no wildcards for topology) - 8B-128M +# Small reducescatter: ring/ll, 2 channels, 1 node, 4 ranks, any pipeOps, any regBuff (8 bytes to 64K) +reducescatter,8,65536,ring,ll,2,1,4,-1,-1 +# Medium reducescatter: ring/ll128, 4 channels, 1 node, 4 ranks, any pipeOps, any regBuff (64K to 16M) +reducescatter,65537,16777216,ring,ll128,4,1,4,-1,-1 +# Large reducescatter: ring/simple, 6 channels, 1 node, 4 ranks, any pipeOps, any regBuff (16M to 128M) +reducescatter,16777217,134217728,ring,simple,6,1,4,-1,-1 diff --git a/test/ext-plugins/pytest.ini b/test/ext-plugins/pytest.ini new file mode 100644 index 0000000000..61ad276a54 --- /dev/null +++ b/test/ext-plugins/pytest.ini @@ -0,0 +1,12 @@ +# pytest.ini +[pytest] +markers = + ext_tuner: marks tests related to CSV Tuner Plugin + allreduce: marks tests related to AllReduce collective + broadcast: marks tests related to Broadcast collective + reduce: marks tests related to Reduce collective + allgather: marks tests related to AllGather collective + reducescatter: marks tests related to ReduceScatter collective + multinode: marks tests related to multi-node configuration + +testpaths = tests \ No newline at end of file diff --git a/test/ext-plugins/requirements.txt b/test/ext-plugins/requirements.txt new file mode 100644 index 0000000000..936ed46450 --- /dev/null +++ b/test/ext-plugins/requirements.txt @@ -0,0 +1,2 @@ +# Testing dependencies 
+pytest \ No newline at end of file diff --git a/test/ext-plugins/tests/conftest.py b/test/ext-plugins/tests/conftest.py new file mode 100644 index 0000000000..a2b02837c9 --- /dev/null +++ b/test/ext-plugins/tests/conftest.py @@ -0,0 +1,109 @@ +# ************************************************************************* +# * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# * +# * See LICENSE.txt for license information +# ************************************************************************ + +import os +import pytest +import subprocess +import re +from types import SimpleNamespace + +WORKDIR = os.getcwd() + +RCCL_INSTALL_DIR = "path/to/rccl" +OMPI_INSTALL_DIR = "path/to/ompi/install" +RCCL_TESTS_DIR = "path/to/rccl-tests" + +# Plugin Paths +PLUGIN_DIR = f"{RCCL_INSTALL_DIR}/ext-tuner/example" +PLUGIN_SO = f"{PLUGIN_DIR}/libnccl-tuner-example.so" + +# CSV Configs +VALID_CONFIG_WITH_WILDCARDS = os.path.join(WORKDIR, "assets/csv_confs/valid_config_with_wildcards.conf") +VALID_CONFIG_WITHOUT_WILDCARDS = os.path.join(WORKDIR, "assets/csv_confs/valid_config_without_wildcards.conf") +NO_MATCHING_CONFIG = os.path.join(WORKDIR, "assets/csv_confs/no_matching_config.conf") +INCORRECT_VALUES_CONFIG = os.path.join(WORKDIR, "assets/csv_confs/incorrect_values_config.conf") +UNSUPPORTED_ALGO_PROTO_CONFIG = os.path.join(WORKDIR, "assets/csv_confs/unsupported_algo_proto_config.conf") +SINGLENODE_CONFIG = os.path.join(WORKDIR, "assets/csv_confs/singlenode_config.conf") +MULTINODE_CONFIG = os.path.join(WORKDIR, "assets/csv_confs/multinode_config.conf") + +LOGDIR = os.path.join(WORKDIR, "logs") +os.makedirs(LOGDIR, exist_ok=True) + +# Helper Functions +def get_avg_bus_bandwidth(log_content: str): + """Extract average bus bandwidth from RCCL test log""" + pattern = r'#\s*Avg bus bandwidth\s*:\s*([\d.]+)' + match = re.search(pattern, log_content, re.IGNORECASE) + return float(match.group(1)) if match else None + +def check_node_interface(node: str, interface: 
str) -> bool: + """Check if a node has the specified interface with an IP address""" + try: + cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", + node, f"ip addr show {interface} | grep 'inet ' | wc -l"] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) + return result.returncode == 0 and int(result.stdout.strip()) > 0 + except (subprocess.CalledProcessError, subprocess.TimeoutExpired, ValueError): + return False + +def find_common_interface(nodelist): + """Find a common network interface across all nodes""" + interfaces_to_check = ["eth0", "eth1"] + + for interface in interfaces_to_check: + all_nodes_have_interface = True + for node in nodelist: + if not check_node_interface(node, interface): + all_nodes_have_interface = False + break + if all_nodes_have_interface: + return interface + return None + +def get_available_nodes(): + """Get available nodes from SLURM environment""" + try: + # Get available nodes + result = subprocess.run( + ["scontrol", "show", "hostnames"], + capture_output=True, + text=True, + check=True + ) + nodelist = result.stdout.strip().split('\n') + nodelist = [node.strip() for node in nodelist if node.strip()] + + return nodelist + + except (subprocess.CalledProcessError, FileNotFoundError): + return [] + +# Pytest Fixture +@pytest.fixture(scope="session") +def paths(): + return SimpleNamespace( + # Paths + WORKDIR=WORKDIR, + RCCL_INSTALL_DIR=RCCL_INSTALL_DIR, + OMPI_INSTALL_DIR=OMPI_INSTALL_DIR, + PLUGIN_DIR=PLUGIN_DIR, + PLUGIN_SO=PLUGIN_SO, + RCCL_TESTS_DIR=RCCL_TESTS_DIR, + # CSV Configs + VALID_CONFIG_WITH_WILDCARDS=VALID_CONFIG_WITH_WILDCARDS, + VALID_CONFIG_WITHOUT_WILDCARDS=VALID_CONFIG_WITHOUT_WILDCARDS, + NO_MATCHING_CONFIG=NO_MATCHING_CONFIG, + INCORRECT_VALUES_CONFIG=INCORRECT_VALUES_CONFIG, + UNSUPPORTED_ALGO_PROTO_CONFIG=UNSUPPORTED_ALGO_PROTO_CONFIG, + SINGLENODE_CONFIG=SINGLENODE_CONFIG, + MULTINODE_CONFIG=MULTINODE_CONFIG, + LOGDIR=LOGDIR, + # Helper Functions + 
get_avg_bus_bandwidth=get_avg_bus_bandwidth, + check_node_interface=check_node_interface, + find_common_interface=find_common_interface, + get_available_nodes=get_available_nodes, + ) \ No newline at end of file diff --git a/test/ext-plugins/tests/ext-tuner/test_allgather.py b/test/ext-plugins/tests/ext-tuner/test_allgather.py new file mode 100644 index 0000000000..8943a8e5d2 --- /dev/null +++ b/test/ext-plugins/tests/ext-tuner/test_allgather.py @@ -0,0 +1,440 @@ +# ************************************************************************* +# * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# * +# * See LICENSE.txt for license information +# ************************************************************************ + +import os +import subprocess +import pytest + +@pytest.mark.ext_tuner +@pytest.mark.allgather +def test_valid_config_with_wildcards(paths): + """Test CSV plugin with wildcard values for matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITH_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_gather_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allgather_log_dir = os.path.join(paths.LOGDIR, "allgather_csv_plugin_test_logs") + os.makedirs(allgather_log_dir, exist_ok=True) + + log_file = os.path.join(allgather_log_dir, "test_allgather_valid_config_with_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, 
+ universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin allgather test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITH_WILDCARDS}" + + # Check that plugin applied valid configurations (test fails if no configs applied) + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied valid configurations, but none were applied. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allgather +def test_valid_config_without_wildcards(paths): + """Test CSV plugin with specific values (no wildcards -1) for precise matching""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITHOUT_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_gather_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allgather_log_dir = os.path.join(paths.LOGDIR, "allgather_csv_plugin_test_logs") + os.makedirs(allgather_log_dir, exist_ok=True) + + log_file = os.path.join(allgather_log_dir, "test_allgather_valid_config_without_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + 
universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin allgather test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}" + + # With specific (non-wildcard) values, at least one config is expected to match and be applied + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + # Test should fail if no config is applied - we expect specific configs to match + assert plugin_applied, \ + f"Plugin should have applied at least one configuration from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allgather +def test_no_matching_config(paths): + """Test CSV plugin behavior with no matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.NO_MATCHING_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_gather_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allgather_log_dir = os.path.join(paths.LOGDIR, "allgather_csv_plugin_test_logs") + os.makedirs(allgather_log_dir, exist_ok=True) + + log_file = os.path.join(allgather_log_dir, "test_allgather_no_matching_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args,
+ env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin allgather test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.NO_MATCHING_CONFIG}" + + # Check that NO configurations were applied (they should not match the test environment) + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert not plugin_applied, \ + f"Plugin should NOT have applied any configurations from {paths.NO_MATCHING_CONFIG} as they don't match the test environment. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allgather +def test_incorrect_values_config(paths): + """Test CSV plugin behavior with invalid/incorrect values in configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.INCORRECT_VALUES_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_gather_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allgather_log_dir = os.path.join(paths.LOGDIR, "allgather_csv_plugin_test_logs") + os.makedirs(allgather_log_dir, exist_ok=True) + + log_file = os.path.join(allgather_log_dir, "test_allgather_incorrect_values_config.log") + with open(log_file, "w") as logfile: + rccl_test = 
subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin allgather test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded some configurations (plugin should handle invalid values gracefully) + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.INCORRECT_VALUES_CONFIG}" + + # Plugin should still function despite invalid values (using defaults) + # It might apply configs with default values or report no matches + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + "Plugin should either apply configurations (with defaults) or report no matches" + +@pytest.mark.ext_tuner +@pytest.mark.allgather +def test_unsupported_algo_proto_config(paths): + """Test that plugin handles unsupported algorithm/protocol combinations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.UNSUPPORTED_ALGO_PROTO_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_gather_perf", + "-b", "64", + "-e", "1M", + "-f", "2", + "-g", "1", + ] + + allgather_log_dir = os.path.join(paths.LOGDIR, "allgather_csv_plugin_test_logs") + os.makedirs(allgather_log_dir, exist_ok=True) + + log_file = os.path.join(allgather_log_dir, 
"test_allgather_unsupported_algo_proto.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin allgather test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.UNSUPPORTED_ALGO_PROTO_CONFIG}" + + # Check for unsupported combinations - should see IGNORE or out of bounds messages + ignored_combinations = "Algorithm/protocol combination" in log_content and "is marked as IGNORE" in log_content + out_of_bounds = "out of bounds" in log_content + + assert ignored_combinations or out_of_bounds, \ + f"Plugin should report unsupported algorithm/protocol combinations as IGNORE or out of bounds. 
Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allgather +def test_singlenode_config(paths): + """Test CSV plugin with single-node configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.SINGLENODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_gather_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allgather_log_dir = os.path.join(paths.LOGDIR, "allgather_csv_plugin_test_logs") + os.makedirs(allgather_log_dir, exist_ok=True) + + log_file = os.path.join(allgather_log_dir, "test_allgather_singlenode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Single-node CSV Plugin allgather test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.SINGLENODE_CONFIG}" + + # Check that configurations were applied for single-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied single-node configurations from {paths.SINGLENODE_CONFIG}. 
Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allgather +@pytest.mark.multinode +def test_multinode_config(paths): + """Test CSV plugin with multi-node configuration""" + + try: + # Get available nodes + result = subprocess.run( + ["scontrol", "show", "hostnames"], + capture_output=True, + text=True, + check=True + ) + nodelist = result.stdout.strip().split('\n') + nodelist = [node.strip() for node in nodelist if node.strip()] + + print(f"Available nodes: {nodelist}") + + # Skip test if less than 2 nodes available + if len(nodelist) < 2: + pytest.skip(f"Multinode allgather test requires at least 2 nodes, but only {len(nodelist)} available: {nodelist}") + + except (subprocess.CalledProcessError, FileNotFoundError): + pytest.skip("Multinode allgather test requires SLURM environment (scontrol command not available)") + + # Check for common network interface across all nodes + common_interface = paths.find_common_interface(nodelist) + if common_interface is None: + pytest.skip(f"Multinode allgather test requires all nodes to have the same network interface (eth0 or eth1).") + + # Build host specification string (8 processes per node) + host_spec = ",".join([f"{node}:8" for node in nodelist]) + total_processes = len(nodelist) * 8 + print(f"Using host specification: {host_spec}") + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_IGNORE_CPU_AFFINITY": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.MULTINODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + "NCCL_SOCKET_IFNAME": common_interface, + "NCCL_DMABUF_ENABLE": "1", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", f"{total_processes}", + "--host", host_spec, + "--mca", "pml", "ucx", + "--mca",
"btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_gather_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allgather_log_dir = os.path.join(paths.LOGDIR, "allgather_csv_plugin_test_logs") + os.makedirs(allgather_log_dir, exist_ok=True) + + log_file = os.path.join(allgather_log_dir, "test_allgather_multinode.log") + + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Multi-node CSV Plugin allgather test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.MULTINODE_CONFIG}" + + # Check that configurations were applied for multi-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied multi-node configurations from {paths.MULTINODE_CONFIG}. Check {log_file} for details" diff --git a/test/ext-plugins/tests/ext-tuner/test_allreduce.py b/test/ext-plugins/tests/ext-tuner/test_allreduce.py new file mode 100644 index 0000000000..0c85e73ff2 --- /dev/null +++ b/test/ext-plugins/tests/ext-tuner/test_allreduce.py @@ -0,0 +1,431 @@ +# ************************************************************************* +# * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# * +# * See LICENSE.txt for license information +# ************************************************************************ + +import os +import subprocess +import pytest + +@pytest.mark.ext_tuner +@pytest.mark.allreduce +def test_valid_config_with_wildcards(paths): + """Test CSV plugin with wildcard values for matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITH_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allreduce_log_dir = os.path.join(paths.LOGDIR, "allreduce_csv_plugin_test_logs") + os.makedirs(allreduce_log_dir, exist_ok=True) + + log_file = os.path.join(allreduce_log_dir, "test_allreduce_valid_config_with_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITH_WILDCARDS}" + + # Check that plugin applied valid configurations (test fails if no configs applied) + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in 
log_content + + assert plugin_applied, \ + f"Plugin should have applied valid configurations, but none were applied. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allreduce +def test_valid_config_without_wildcards(paths): + """Test CSV plugin with specific values (no wildcards -1) for precise matching""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITHOUT_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allreduce_log_dir = os.path.join(paths.LOGDIR, "allreduce_csv_plugin_test_logs") + os.makedirs(allreduce_log_dir, exist_ok=True) + + log_file = os.path.join(allreduce_log_dir, "test_allreduce_valid_config_without_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}" + + # With specific (non-wildcard) values, at least one config is expected to match and be applied + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in
log_content + + # Test should fail if no config is applied - we expect specific configs to match + assert plugin_applied, \ + f"Plugin should have applied at least one configuration from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allreduce +def test_no_matching_config(paths): + """Test CSV plugin behavior with no matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.NO_MATCHING_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--bind-to", "none", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allreduce_log_dir = os.path.join(paths.LOGDIR, "allreduce_csv_plugin_test_logs") + os.makedirs(allreduce_log_dir, exist_ok=True) + + log_file = os.path.join(allreduce_log_dir, "test_allreduce_no_matching_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.NO_MATCHING_CONFIG}" + + # Check that NO configurations were applied (they should not match the test environment) + 
plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert not plugin_applied, \ + f"Plugin should NOT have applied any configurations from {paths.NO_MATCHING_CONFIG} as they don't match the test environment. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allreduce +def test_incorrect_values_config(paths): + """Test CSV plugin behavior with invalid/incorrect values in configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.INCORRECT_VALUES_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allreduce_log_dir = os.path.join(paths.LOGDIR, "allreduce_csv_plugin_test_logs") + os.makedirs(allreduce_log_dir, exist_ok=True) + + log_file = os.path.join(allreduce_log_dir, "test_allreduce_incorrect_values_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded some configurations (plugin should handle invalid values gracefully) + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.INCORRECT_VALUES_CONFIG}" + + # Plugin should 
still function despite invalid values (using defaults) + # It might apply configs with default values or report no matches + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + "Plugin should either apply configurations (with defaults) or report no matches" + +@pytest.mark.ext_tuner +@pytest.mark.allreduce +def test_unsupported_algo_proto_config(paths): + """Test that plugin handles unsupported algorithm/protocol combinations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.UNSUPPORTED_ALGO_PROTO_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allreduce_log_dir = os.path.join(paths.LOGDIR, "allreduce_csv_plugin_test_logs") + os.makedirs(allreduce_log_dir, exist_ok=True) + + log_file = os.path.join(allreduce_log_dir, "test_allreduce_unsupported_algo_proto.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.UNSUPPORTED_ALGO_PROTO_CONFIG}" + + # Check for 
unsupported combinations - should see IGNORE or out of bounds messages + ignored_combinations = "Algorithm/protocol combination" in log_content and "is marked as IGNORE" in log_content + out_of_bounds = "out of bounds" in log_content + + assert ignored_combinations or out_of_bounds, \ + f"Plugin should report unsupported algorithm/protocol combinations as IGNORE or out of bounds. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allreduce +def test_singlenode_config(paths): + """Test CSV plugin with single-node configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.SINGLENODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--bind-to", "none", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allreduce_log_dir = os.path.join(paths.LOGDIR, "allreduce_csv_plugin_test_logs") + os.makedirs(allreduce_log_dir, exist_ok=True) + + log_file = os.path.join(allreduce_log_dir, "test_allreduce_singlenode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Single-node CSV Plugin test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded 
configurations from {paths.SINGLENODE_CONFIG}" + + # Check that configurations were applied for single-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied single-node configurations from {paths.SINGLENODE_CONFIG}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.allreduce +@pytest.mark.multinode +def test_multinode_config(paths): + """Test CSV plugin with multi-node configuration""" + + # Get available nodes using the shared function + nodelist = paths.get_available_nodes() + + # Skip test if no nodes available (SLURM not available) or less than 2 nodes + if len(nodelist) == 0: + pytest.skip("No nodes available") + elif len(nodelist) < 2: + pytest.skip(f"Multinode allreduce test requires at least 2 nodes, but only {len(nodelist)} available: {nodelist}") + + # Check for common network interface across all nodes + common_interface = paths.find_common_interface(nodelist) + if common_interface is None: + pytest.skip(f"Multinode allreduce test requires all nodes to have the same network interface (eth0 or eth1).") + + # Build host specification string (8 processes per node) + host_spec = ",".join([f"{node}:8" for node in nodelist]) + total_processes = len(nodelist) * 8 + print(f"Using host specification: {host_spec}") + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_IGNORE_CPU_AFFINITY": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.MULTINODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + "NCCL_SOCKET_IFNAME": common_interface, + "NCCL_DMABUF_ENABLE": "1", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", f"{total_processes}", + "--host", host_spec, +
"--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/all_reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + allreduce_log_dir = os.path.join(paths.LOGDIR, "allreduce_csv_plugin_test_logs") + os.makedirs(allreduce_log_dir, exist_ok=True) + + log_file = os.path.join(allreduce_log_dir, "test_allreduce_multinode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Multi-node CSV Plugin test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.MULTINODE_CONFIG}" + + # Check that configurations were applied for multi-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied multi-node configurations from {paths.MULTINODE_CONFIG}. Check {log_file} for details" + diff --git a/test/ext-plugins/tests/ext-tuner/test_broadcast.py b/test/ext-plugins/tests/ext-tuner/test_broadcast.py new file mode 100644 index 0000000000..87efcd2d5a --- /dev/null +++ b/test/ext-plugins/tests/ext-tuner/test_broadcast.py @@ -0,0 +1,428 @@ +# ************************************************************************* +# * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# * +# * See LICENSE.txt for license information +# ************************************************************************ + +import os +import subprocess +import pytest + +@pytest.mark.ext_tuner +@pytest.mark.broadcast +def test_valid_config_with_wildcards(paths): + """Test CSV plugin with wildcard values for matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITH_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/broadcast_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + broadcast_log_dir = os.path.join(paths.LOGDIR, "broadcast_csv_plugin_test_logs") + os.makedirs(broadcast_log_dir, exist_ok=True) + + log_file = os.path.join(broadcast_log_dir, "test_broadcast_valid_config_with_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin broadcast test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITH_WILDCARDS}" + + # Check that plugin applied valid configurations (test fails if no configs applied) + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in 
log_content + + assert plugin_applied, \ + f"Plugin should have applied valid configurations, but none were applied. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.broadcast +def test_valid_config_without_wildcards(paths): + """Test CSV plugin with specific values (no wildcards -1) for precise matching""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITHOUT_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/broadcast_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + broadcast_log_dir = os.path.join(paths.LOGDIR, "broadcast_csv_plugin_test_logs") + os.makedirs(broadcast_log_dir, exist_ok=True) + + log_file = os.path.join(broadcast_log_dir, "test_broadcast_valid_config_without_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin broadcast test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}" + + # With specific values, plugin should either apply matching configs or report no matches + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" 
in log_content + + # Test should fail if no config is applied - we expect specific configs to match + assert plugin_applied, \ + f"Plugin should have applied at least one configuration from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.broadcast +def test_no_matching_config(paths): + """Test CSV plugin behavior with no matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.NO_MATCHING_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/broadcast_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + broadcast_log_dir = os.path.join(paths.LOGDIR, "broadcast_csv_plugin_test_logs") + os.makedirs(broadcast_log_dir, exist_ok=True) + + log_file = os.path.join(broadcast_log_dir, "test_broadcast_no_matching_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin broadcast test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.NO_MATCHING_CONFIG}" + + # Check that NO configurations were applied (they should not match the test environment) + plugin_applied = 
"TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert not plugin_applied, \ + f"Plugin should NOT have applied any configurations from {paths.NO_MATCHING_CONFIG} as they don't match the test environment. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.broadcast +def test_incorrect_values_config(paths): + """Test CSV plugin behavior with invalid/incorrect values in configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.INCORRECT_VALUES_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/broadcast_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + broadcast_log_dir = os.path.join(paths.LOGDIR, "broadcast_csv_plugin_test_logs") + os.makedirs(broadcast_log_dir, exist_ok=True) + + log_file = os.path.join(broadcast_log_dir, "test_broadcast_incorrect_values_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin broadcast test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded some configurations (plugin should handle invalid values gracefully) + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.INCORRECT_VALUES_CONFIG}" + + # Plugin should still 
function despite invalid values (using defaults) + # It might apply configs with default values or report no matches + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + "Plugin should either apply configurations (with defaults) or report no matches" + +@pytest.mark.ext_tuner +@pytest.mark.broadcast +def test_unsupported_algo_proto_config(paths): + """Test that plugin handles unsupported algorithm/protocol combinations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.UNSUPPORTED_ALGO_PROTO_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/broadcast_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + broadcast_log_dir = os.path.join(paths.LOGDIR, "broadcast_csv_plugin_test_logs") + os.makedirs(broadcast_log_dir, exist_ok=True) + + log_file = os.path.join(broadcast_log_dir, "test_broadcast_unsupported_algo_proto.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin broadcast test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.UNSUPPORTED_ALGO_PROTO_CONFIG}" + + # Check 
for unsupported combinations - should see IGNORE or out of bounds messages + ignored_combinations = "Algorithm/protocol combination" in log_content and "is marked as IGNORE" in log_content + out_of_bounds = "out of bounds" in log_content + + assert ignored_combinations or out_of_bounds, \ + f"Plugin should report unsupported algorithm/protocol combinations as IGNORE or out of bounds. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.broadcast +def test_singlenode_config(paths): + """Test CSV plugin with single-node configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.SINGLENODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/broadcast_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + broadcast_log_dir = os.path.join(paths.LOGDIR, "broadcast_csv_plugin_test_logs") + os.makedirs(broadcast_log_dir, exist_ok=True) + + log_file = os.path.join(broadcast_log_dir, "test_broadcast_singlenode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Single-node CSV Plugin broadcast test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations 
from {paths.SINGLENODE_CONFIG}" + + # Check that configurations were applied for single-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied single-node configurations from {paths.SINGLENODE_CONFIG}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.broadcast +@pytest.mark.multinode +def test_multinode_config(paths): + """Test CSV plugin with multi-node configuration""" + + # Get available nodes using the shared function + nodelist = paths.get_available_nodes() + + # Skip test if no nodes available (SLURM not available) or less than 2 nodes + if len(nodelist) == 0: + pytest.skip("No nodes available") + elif len(nodelist) < 2: + pytest.skip(f"Multinode broadcast test requires at least 2 nodes, but only {len(nodelist)} available: {nodelist}") + + # Check for common network interface across all nodes + common_interface = paths.find_common_interface(nodelist) + if common_interface is None: + pytest.skip(f"Multinode broadcast test requires all nodes to have the same network interface (eth0 or eth1).") + + # Build host specification string (8 processes per node) + host_spec = ",".join([f"{node}:8" for node in nodelist]) + total_processes = len(nodelist) * 8 + print(f"Using host specification: {host_spec}") + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_IGNORE_CPU_AFFINITY": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.MULTINODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + "NCCL_SOCKET_IFNAME": common_interface, + "NCCL_DMABUF_ENABLE": "1", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", f"{total_processes}", + "--host", host_spec, + "--mca", 
"pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/broadcast_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + broadcast_log_dir = os.path.join(paths.LOGDIR, "broadcast_csv_plugin_test_logs") + os.makedirs(broadcast_log_dir, exist_ok=True) + + log_file = os.path.join(broadcast_log_dir, "test_broadcast_multinode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Multi-node CSV Plugin broadcast test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.MULTINODE_CONFIG}" + + # Check that configurations were applied for multi-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied multi-node configurations from {paths.MULTINODE_CONFIG}. Check {log_file} for details" diff --git a/test/ext-plugins/tests/ext-tuner/test_reduce.py b/test/ext-plugins/tests/ext-tuner/test_reduce.py new file mode 100644 index 0000000000..a03d5bea4a --- /dev/null +++ b/test/ext-plugins/tests/ext-tuner/test_reduce.py @@ -0,0 +1,428 @@ +# ************************************************************************* +# * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# * +# * See LICENSE.txt for license information +# ************************************************************************ + +import os +import subprocess +import pytest + +@pytest.mark.ext_tuner +@pytest.mark.reduce +def test_valid_config_with_wildcards(paths): + """Test CSV plugin with wildcard values for matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITH_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reduce_log_dir = os.path.join(paths.LOGDIR, "reduce_csv_plugin_test_logs") + os.makedirs(reduce_log_dir, exist_ok=True) + + log_file = os.path.join(reduce_log_dir, "test_reduce_valid_config_with_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reduce test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITH_WILDCARDS}" + + # Check that plugin applied valid configurations (test fails if no configs applied) + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert 
plugin_applied, \ + f"Plugin should have applied valid configurations, but none were applied. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reduce +def test_valid_config_without_wildcards(paths): + """Test CSV plugin with specific values (no wildcards -1) for precise matching""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITHOUT_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reduce_log_dir = os.path.join(paths.LOGDIR, "reduce_csv_plugin_test_logs") + os.makedirs(reduce_log_dir, exist_ok=True) + + log_file = os.path.join(reduce_log_dir, "test_reduce_valid_config_without_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reduce test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}" + + # With specific values, plugin should either apply matching configs or report no matches + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + # Test should fail if no 
config is applied - we expect specific configs to match + assert plugin_applied, \ + f"Plugin should have applied at least one configuration from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reduce +def test_no_matching_config(paths): + """Test CSV plugin behavior with no matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.NO_MATCHING_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reduce_log_dir = os.path.join(paths.LOGDIR, "reduce_csv_plugin_test_logs") + os.makedirs(reduce_log_dir, exist_ok=True) + + log_file = os.path.join(reduce_log_dir, "test_reduce_no_matching_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reduce test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.NO_MATCHING_CONFIG}" + + # Check that NO configurations were applied (they should not match the test environment) + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + 
assert not plugin_applied, \ + f"Plugin should NOT have applied any configurations from {paths.NO_MATCHING_CONFIG} as they don't match the test environment. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reduce +def test_incorrect_values_config(paths): + """Test CSV plugin behavior with invalid/incorrect values in configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.INCORRECT_VALUES_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reduce_log_dir = os.path.join(paths.LOGDIR, "reduce_csv_plugin_test_logs") + os.makedirs(reduce_log_dir, exist_ok=True) + + log_file = os.path.join(reduce_log_dir, "test_reduce_incorrect_values_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reduce test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded some configurations (plugin should handle invalid values gracefully) + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.INCORRECT_VALUES_CONFIG}" + + # Plugin should still function despite invalid values (using defaults) + # It might apply configs with default values or 
report no matches + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + "Plugin should either apply configurations (with defaults) or report no matches" + +@pytest.mark.ext_tuner +@pytest.mark.reduce +def test_unsupported_algo_proto_config(paths): + """Test that plugin handles unsupported algorithm/protocol combinations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.UNSUPPORTED_ALGO_PROTO_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reduce_log_dir = os.path.join(paths.LOGDIR, "reduce_csv_plugin_test_logs") + os.makedirs(reduce_log_dir, exist_ok=True) + + log_file = os.path.join(reduce_log_dir, "test_reduce_unsupported_algo_proto.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reduce test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.UNSUPPORTED_ALGO_PROTO_CONFIG}" + + # Check for unsupported combinations - should see IGNORE or out of bounds messages + ignored_combinations = "Algorithm/protocol 
combination" in log_content and "is marked as IGNORE" in log_content + out_of_bounds = "out of bounds" in log_content + + assert ignored_combinations or out_of_bounds, \ + f"Plugin should report unsupported algorithm/protocol combinations as IGNORE or out of bounds. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reduce +def test_singlenode_config(paths): + """Test CSV plugin with single-node configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.SINGLENODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reduce_log_dir = os.path.join(paths.LOGDIR, "reduce_csv_plugin_test_logs") + os.makedirs(reduce_log_dir, exist_ok=True) + + log_file = os.path.join(reduce_log_dir, "test_reduce_singlenode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Single-node CSV Plugin reduce test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.SINGLENODE_CONFIG}" + + # Check that configurations were applied for single-node setup + plugin_applied = "TUNER/ExamplePlugin: 
Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied single-node configurations from {paths.SINGLENODE_CONFIG}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reduce +@pytest.mark.multinode +def test_multinode_config(paths): + """Test CSV plugin with multi-node configuration""" + + # Get available nodes using the shared function + nodelist = paths.get_available_nodes() + + # Skip test if no nodes available (SLURM not available) or less than 2 nodes + if len(nodelist) == 0: + pytest.skip("No nodes available") + elif len(nodelist) < 2: + pytest.skip(f"Multinode reduce test requires at least 2 nodes, but only {len(nodelist)} available: {nodelist}") + + # Check for common network interface across all nodes + common_interface = paths.find_common_interface(nodelist) + if common_interface is None: + pytest.skip(f"Multinode reduce test requires all nodes to have the same network interface (eth0 or eth1).") + + # Build host specification string (8 processes per node) + host_spec = ",".join([f"{node}:8" for node in nodelist]) + total_processes = len(nodelist) * 8 + print(f"Using host specification: {host_spec}") + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_IGNORE_CPU_AFFINITY": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.MULTINODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + "NCCL_SOCKET_IFNAME": common_interface, + "NCCL_DMABUF_ENABLE": "1", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", f"{total_processes}", + "--host", host_spec, + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_perf", + "-b", "8", # start at 8 bytes + "-e", 
"128M", # end at 128 MB + "-f", "2", # multiply message size by 2 each step + "-g", "1", # 1 GPU per thread + ] + + reduce_log_dir = os.path.join(paths.LOGDIR, "reduce_csv_plugin_test_logs") + os.makedirs(reduce_log_dir, exist_ok=True) + + log_file = os.path.join(reduce_log_dir, "test_reduce_multinode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Multi-node CSV Plugin reduce test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.MULTINODE_CONFIG}" + + # Check that configurations were applied for multi-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied multi-node configurations from {paths.MULTINODE_CONFIG}. Check {log_file} for details" diff --git a/test/ext-plugins/tests/ext-tuner/test_reducescatter.py b/test/ext-plugins/tests/ext-tuner/test_reducescatter.py new file mode 100644 index 0000000000..a1d02f894e --- /dev/null +++ b/test/ext-plugins/tests/ext-tuner/test_reducescatter.py @@ -0,0 +1,428 @@ +# ************************************************************************* +# * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# * +# * See LICENSE.txt for license information +# ************************************************************************ + +import os +import subprocess +import pytest + +@pytest.mark.ext_tuner +@pytest.mark.reducescatter +def test_valid_config_with_wildcards(paths): + """Test CSV plugin with wildcard values for matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITH_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_scatter_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reducescatter_log_dir = os.path.join(paths.LOGDIR, "reducescatter_csv_plugin_test_logs") + os.makedirs(reducescatter_log_dir, exist_ok=True) + + log_file = os.path.join(reducescatter_log_dir, "test_reducescatter_valid_config_with_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reducescatter test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITH_WILDCARDS}" + + # Check that plugin applied valid configurations (test fails if no configs applied) + plugin_applied = "TUNER/ExamplePlugin: 
Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied valid configurations, but none were applied. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reducescatter +def test_valid_config_without_wildcards(paths): + """Test CSV plugin with specific values (no wildcards -1) for precise matching""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.VALID_CONFIG_WITHOUT_WILDCARDS, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_scatter_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reducescatter_log_dir = os.path.join(paths.LOGDIR, "reducescatter_csv_plugin_test_logs") + os.makedirs(reducescatter_log_dir, exist_ok=True) + + log_file = os.path.join(reducescatter_log_dir, "test_reducescatter_valid_config_without_wildcards.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reducescatter test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}" + + # With specific values, at least one config is expected to match and be applied + 
plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + # Test should fail if no config is applied - we expect specific configs to match + assert plugin_applied, \ + f"Plugin should have applied at least one configuration from {paths.VALID_CONFIG_WITHOUT_WILDCARDS}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reducescatter +def test_no_matching_config(paths): + """Test CSV plugin behavior with no matching configurations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.NO_MATCHING_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_scatter_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reducescatter_log_dir = os.path.join(paths.LOGDIR, "reducescatter_csv_plugin_test_logs") + os.makedirs(reducescatter_log_dir, exist_ok=True) + + log_file = os.path.join(reducescatter_log_dir, "test_reducescatter_no_matching_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reducescatter test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.NO_MATCHING_CONFIG}" + + # Check that 
NO configurations were applied (they should not match the test environment) + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert not plugin_applied, \ + f"Plugin should NOT have applied any configurations from {paths.NO_MATCHING_CONFIG} as they don't match the test environment. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reducescatter +def test_incorrect_values_config(paths): + """Test CSV plugin behavior with invalid/incorrect values in configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.INCORRECT_VALUES_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_scatter_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reducescatter_log_dir = os.path.join(paths.LOGDIR, "reducescatter_csv_plugin_test_logs") + os.makedirs(reducescatter_log_dir, exist_ok=True) + + log_file = os.path.join(reducescatter_log_dir, "test_reducescatter_incorrect_values_config.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reducescatter test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded some configurations (plugin should handle invalid values gracefully) + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in 
log_content, \ + f"Plugin should have loaded configurations from {paths.INCORRECT_VALUES_CONFIG}" + + # Plugin should still function despite invalid values (using defaults) + # At least one config must still be applied: the assertion below fails when nothing is applied + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + "Plugin should either apply configurations (with defaults) or report no matches" + +@pytest.mark.ext_tuner +@pytest.mark.reducescatter +def test_unsupported_algo_proto_config(paths): + """Test that plugin handles unsupported algorithm/protocol combinations""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.UNSUPPORTED_ALGO_PROTO_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "4", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_scatter_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reducescatter_log_dir = os.path.join(paths.LOGDIR, "reducescatter_csv_plugin_test_logs") + os.makedirs(reducescatter_log_dir, exist_ok=True) + + log_file = os.path.join(reducescatter_log_dir, "test_reducescatter_unsupported_algo_proto.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"CSV Plugin reducescatter test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in 
log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.UNSUPPORTED_ALGO_PROTO_CONFIG}" + + # Check for unsupported combinations - should see IGNORE or out of bounds messages + ignored_combinations = "Algorithm/protocol combination" in log_content and "is marked as IGNORE" in log_content + out_of_bounds = "out of bounds" in log_content + + assert ignored_combinations or out_of_bounds, \ + f"Plugin should report unsupported algorithm/protocol combinations as IGNORE or out of bounds. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reducescatter +def test_singlenode_config(paths): + """Test CSV plugin with single-node configuration""" + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.SINGLENODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "TUNING", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", "8", + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_scatter_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reducescatter_log_dir = os.path.join(paths.LOGDIR, "reducescatter_csv_plugin_test_logs") + os.makedirs(reducescatter_log_dir, exist_ok=True) + + log_file = os.path.join(reducescatter_log_dir, "test_reducescatter_singlenode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Single-node CSV Plugin reducescatter test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + 
+ # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.SINGLENODE_CONFIG}" + + # Check that configurations were applied for single-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied single-node configurations from {paths.SINGLENODE_CONFIG}. Check {log_file} for details" + +@pytest.mark.ext_tuner +@pytest.mark.reducescatter +@pytest.mark.multinode +def test_multinode_config(paths): + """Test CSV plugin with multi-node configuration""" + + # Get available nodes using the shared function + nodelist = paths.get_available_nodes() + + # Skip test if no nodes available (SLURM not available) or less than 2 nodes + if len(nodelist) == 0: + pytest.skip("No nodes available") + elif len(nodelist) < 2: + pytest.skip(f"Multinode reducescatter test requires at least 2 nodes, but only {len(nodelist)} available: {nodelist}") + + # Check for common network interface across all nodes + common_interface = paths.find_common_interface(nodelist) + if common_interface is None: + pytest.skip(f"Multinode reducescatter test requires all nodes to have the same network interface (eth0 or eth1).") + + # Build host specification string (8 processes per node) + host_spec = ",".join([f"{node}:8" for node in nodelist]) + total_processes = len(nodelist) * 8 + print(f"Using host specification: {host_spec}") + + env = os.environ.copy() + env.update({ + "PATH": f"{paths.OMPI_INSTALL_DIR}/bin:{env.get('PATH', '')}", + "LD_LIBRARY_PATH": f"{paths.RCCL_INSTALL_DIR}:{paths.OMPI_INSTALL_DIR}/lib:{paths.PLUGIN_DIR}:{env.get('LD_LIBRARY_PATH', '')}", + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_IGNORE_CPU_AFFINITY": "1", + "NCCL_TUNER_PLUGIN": paths.PLUGIN_SO, + "NCCL_TUNER_CONFIG_FILE": paths.MULTINODE_CONFIG, + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": 
"TUNING", + "NCCL_SOCKET_IFNAME": common_interface, + "NCCL_DMABUF_ENABLE": "1", + }) + + args = [ + f"{paths.OMPI_INSTALL_DIR}/bin/mpirun", "-np", f"{total_processes}", + "--host", host_spec, + "--mca", "pml", "ucx", + "--mca", "btl", "^vader,openib", + f"{paths.RCCL_TESTS_DIR}/build/reduce_scatter_perf", + "-b", "8", + "-e", "128M", + "-f", "2", + "-g", "1", + ] + + reducescatter_log_dir = os.path.join(paths.LOGDIR, "reducescatter_csv_plugin_test_logs") + os.makedirs(reducescatter_log_dir, exist_ok=True) + + log_file = os.path.join(reducescatter_log_dir, "test_reducescatter_multinode.log") + with open(log_file, "w") as logfile: + rccl_test = subprocess.run( + args, + env=env, + stdout=logfile, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + assert rccl_test.returncode == 0, f"Multi-node CSV Plugin reducescatter test failed, see {log_file}" + + # Read and validate log content + with open(log_file, "r") as logfile: + log_content = logfile.read() + + # Check that plugin loaded configurations + assert "TUNER/ExamplePlugin: Loaded" in log_content and "tuning configurations" in log_content, \ + f"Plugin should have loaded configurations from {paths.MULTINODE_CONFIG}" + + # Check that configurations were applied for multi-node setup + plugin_applied = "TUNER/ExamplePlugin: Applied config for collType=" in log_content + + assert plugin_applied, \ + f"Plugin should have applied multi-node configurations from {paths.MULTINODE_CONFIG}. Check {log_file} for details"