From 30d36661c253541955e8f7644b5443e5e9fab7c0 Mon Sep 17 00:00:00 2001 From: Atul Kulkarni Date: Thu, 8 Jan 2026 08:04:41 -0800 Subject: [PATCH] Adds Python-based test runner for RCCL (#2034) * Added python test runner to execute rccl tests * Disabled capture output to avoid hangs * Add RCCL_TEST_MPI_HOSTFILE env var to get the hostfile * Converted test_type to boolean gtest flag * Removed unused return values * Added custom rccl library usage * Removed json output * Updates to test_runner: added num_gpus field * Address review comments * Prepend env vars for single node, single process executions * Added separate enums for exit and result codes * Update configuration files * Moved configurations to its own dir * Address review comments * Update tools/scripts/test_runner/README.md Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com> --------- Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com> [ROCm/rccl commit: 0c2c61d2f1c112b1eede47f119dcf5593ff0db2c] --- .../rccl/tools/scripts/test_runner/README.md | 984 ++++++++++++++++++ .../configs/mi300x_mellanox_ib.json | 506 +++++++++ .../test_runner/configs/rccl_perf_tests.json | 458 ++++++++ .../configs/test_config_sample.json | 126 +++ .../tools/scripts/test_runner/lib/__init__.py | 20 + .../scripts/test_runner/lib/test_config.py | 401 +++++++ .../scripts/test_runner/lib/test_executor.py | 858 +++++++++++++++ .../scripts/test_runner/lib/test_parser.py | 167 +++ .../tools/scripts/test_runner/test_runner.py | 124 +++ 9 files changed, 3644 insertions(+) create mode 100644 projects/rccl/tools/scripts/test_runner/README.md create mode 100644 projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json create mode 100644 projects/rccl/tools/scripts/test_runner/configs/rccl_perf_tests.json create mode 100644 projects/rccl/tools/scripts/test_runner/configs/test_config_sample.json create mode 100644 
projects/rccl/tools/scripts/test_runner/lib/__init__.py create mode 100644 projects/rccl/tools/scripts/test_runner/lib/test_config.py create mode 100644 projects/rccl/tools/scripts/test_runner/lib/test_executor.py create mode 100644 projects/rccl/tools/scripts/test_runner/lib/test_parser.py create mode 100755 projects/rccl/tools/scripts/test_runner/test_runner.py diff --git a/projects/rccl/tools/scripts/test_runner/README.md b/projects/rccl/tools/scripts/test_runner/README.md new file mode 100644 index 0000000000..7d256a6ebe --- /dev/null +++ b/projects/rccl/tools/scripts/test_runner/README.md @@ -0,0 +1,984 @@ +# RCCL Test Runner + +A Python-based test runner focused on RCCL unit and functional tests with hierarchical configuration support and integrated code coverage reporting. Extensible to support performance benchmarks, MPI tests, and custom test scripts. + +## Overview + +This test runner provides a maintainable, extensible alternative to shell-based test execution. It uses JSON configuration files with hierarchical inheritance, and integrates with LLVM code coverage tools. 
+ +## Key Features + +- **Multiple Test Types**: Support for GTest, performance tests, and custom executables +- **Hierarchical Configuration**: Use `"extends"` directive to inherit and merge configurations +- **Environment Variable Management**: Global, configuration, suite, and test-specific environment variables +- **Path Variable Expansion**: Use environment variables in paths with nested default value expansion +- **Custom Library Support**: Use pre-built RCCL libraries from custom locations via environment variables +- **Configurable Build System**: Customize CMake options, environment variables, and parallel jobs via config +- **MPI Support**: Full support for multi-rank and multi-node tests +- **Flexible Test Filtering**: Run all tests, specific test suites, or individual tests +- **Build Integration**: Automated RCCL building with CMake +- **Code Coverage**: Integrated LLVM coverage report generation (HTML and text) +- **Clean Output**: Verbose MPI messages are filtered out automatically (pass `--verbose` to see the unfiltered output) +- **Verbose Logging**: Detailed output for debugging and troubleshooting + +## Quick Start + +### Basic Usage + +```bash +# Run with specific configuration +python test_runner.py --config my_tests.json + +# Run with verbose output +python test_runner.py --config my_tests.json --verbose + +# Run specific test by name +python test_runner.py --config my_tests.json --test-name SHM_ComprehensiveWorkflow +``` + +### Generate Coverage Report + +```bash +# Build, run tests, and generate coverage report +python test_runner.py --config test_config_sample.json --coverage-report --verbose + +# Use existing build and generate coverage +python test_runner.py --config test_config_sample.json --no-build --coverage-report +``` + +### Use Custom RCCL Library + +```bash +# Use pre-built RCCL library from custom location +export RCCL_LIB_PATH=/path/to/custom/rccl/build +python test_runner.py --config test_config_sample.json + +# Or use RCCL_BUILD_DIR (alternative name) 
+export RCCL_BUILD_DIR=/path/to/custom/rccl/build +python test_runner.py --config test_config_sample.json + +# When set, build step is automatically skipped +# --no-build is not needed +``` + +## Environment Variables + +The test runner supports the following environment variables to customize behavior: + +### Library and Build Configuration + +| Variable | Description | Example | +|----------|-------------|---------| +| `RCCL_LIB_PATH` | Path to pre-built RCCL library directory (contains `librccl.so` and `test/` subdirectory). When set, the build step is automatically skipped. | `/path/to/rccl/build` | +| `RCCL_BUILD_DIR` | Alternative name for `RCCL_LIB_PATH`. Either variable can be used. | `/path/to/rccl/build` | +| `RCCL_TEST_MPI_HOSTFILE` | Path to MPI hostfile for multi-node tests. | `~/.mpi_hostfile` | + +### Configuration Path Variables + +These can be overridden via environment variables or specified in the JSON config: + +| Variable | Description | Default | +|----------|-------------|---------| +| `WORKDIR` | RCCL source and build directory | Current rccl repository root | +| `ROCM_PATH` | ROCm installation path | `/opt/rocm` | +| `MPI_PATH` | MPI installation path | System default or config-specific | + +### Priority Order + +When determining which RCCL library to use, the test runner follows this priority: + +1. **`RCCL_LIB_PATH` or `RCCL_BUILD_DIR` environment variable** (highest priority) + - Skips build automatically + - Must contain `librccl.so` and `test/` subdirectory +2. **`--no-build` flag with local build** + - Uses local `build_debug_cov_on_tests_on/` directory + - Requires prior build +3. 
**Default build process** (lowest priority) + - Builds RCCL in timestamped directory + - Uses CMake configuration from JSON + +**Example Usage:** + +```bash +# Priority 1: Use custom library (build skipped automatically) +export RCCL_LIB_PATH=/path/to/prebuilt/rccl/build +python test_runner.py --config my_tests.json + +# Priority 2: Use existing local build (no new build) +python test_runner.py --config my_tests.json --no-build + +# Priority 3: Fresh build (default) +python test_runner.py --config my_tests.json +``` + +## Configuration File Format + +### Basic Structure + +```json +{ + "system_configurations": { + "name": "system-name", + "description": "System description" + }, + "paths": { + "workdir": "/path/to/rccl", + "rocm_path": "/opt/rocm", + "mpi_path": "/path/to/mpi" + }, + "env_variables": { + "GLOBAL_VAR": "value" + }, + "test_configurations": { + "config_name": { + "env_variables": {...}, + "tests": [...] + } + }, + "test_suites": [ + { + "name": "Test Suite Name", + "config": "config_name", + "enabled": true + } + ] +} +``` + +### Environment Variable Expansion in Paths + +The `paths` section supports environment variable expansion, allowing you to avoid hardcoding paths and make configurations portable across different systems. 
+ +#### Supported Syntax + +```json +{ + "paths": { + "workdir": "${HOME}/code/rccl", + "rocm_path": "$ROCM_PATH", + "mpi_path": "${MPI_PATH:-/opt/mpi}" + } +} +``` + +**Syntax Options:** +- `${VAR}` - Expands to the value of `VAR`, left as-is if undefined +- `$VAR` - Expands to the value of `VAR`, left as-is if undefined +- `${VAR:-default}` - Expands to the value of `VAR`, or `default` if undefined (bash-style default) + +#### Examples + +```json +{ + "paths": { + "workdir": "${WORKDIR:-${HOME}/code/rti/scripts/rccl}", + "rocm_path": "${ROCM_PATH:-/opt/rocm}", + "mpi_path": "${MPI_PATH:-${HOME}/softwares/ompi}" + } +} +``` + +**Usage:** +```bash +# Use environment variables +export WORKDIR=/custom/path/to/rccl +export ROCM_PATH=/opt/rocm-6.0 +export MPI_PATH=/usr/local/mpi + +python test_runner.py --config test_config_sample.json + +# Or use defaults (no environment variables set) +python test_runner.py --config test_config_sample.json +``` + +**Benefits:** +- **Portability**: Share configurations across different systems +- **Flexibility**: Override paths without modifying config files +- **CI/CD**: Easy integration with build systems and pipelines +- **Multi-user**: Same config works for different user environments + +### Test Types Supported + +The test runner uses the `is_gtest` boolean flag to distinguish between test types: + +- **`is_gtest: true`** (default) - GTest-based unit tests using `--gtest_filter` syntax +- **`is_gtest: false`** - Non-GTest tests (performance benchmarks, custom scripts, etc.) + +This simplified approach supports all test categories while reducing configuration complexity. + +#### GTest Tests (`is_gtest: true`) + +Used for unit tests with GTest framework. The `test_filter` field uses GTest filter syntax. 
+ +```json +{ + "name": "AllReduce_InPlace", + "description": "Test AllReduce collective operation with in-place buffers", + "is_gtest": true, + "binary": "rccl-UnitTests", + "test_filter": "AllReduce.InPlace", + "num_ranks": 1, + "num_nodes": 1, + "timeout": 60 +} +``` + +**Command generated:** +```bash +./rccl-UnitTests --gtest_filter=AllReduce.InPlace +``` + +#### Performance Tests (`is_gtest: false`) + +Used for performance benchmarks. Arguments are passed directly without GTest syntax. + +```json +{ + "name": "Perf_Bandwidth", + "description": "Bandwidth benchmark for AllReduce", + "is_gtest": false, + "binary": "all_reduce_perf", + "command_args": "-b 8 -e 128M -f 2", + "num_ranks": 2, + "num_nodes": 1, + "timeout": 300 +} +``` + +**Command generated:** +```bash +mpirun -np 2 ./all_reduce_perf -b 8 -e 128M -f 2 +``` + +#### Custom Scripts (`is_gtest: false`) + +Used for custom validation scripts or any non-GTest executables. + +```json +{ + "name": "Custom_Validation", + "description": "Custom GPU validation script", + "is_gtest": false, + "binary": "validate_gpus.sh", + "command_args": "--full-check --verbose", + "num_ranks": 1, + "num_nodes": 1, + "timeout": 120 +} +``` + +**Command generated:** +```bash +./validate_gpus.sh --full-check --verbose +``` + +**Key Differences:** + +| Feature | `is_gtest: true` | `is_gtest: false` | +|---------|------------------|-------------------| +| Test framework | GTest (Google Test) | Any executable | +| Filter syntax | `--gtest_filter=` | Plain arguments | +| `test_filter` field | GTest pattern (e.g., `Suite.Test*`) | Passed as plain argument | +| `command_args` field | Appended after filter | Primary argument method | +| Typical use cases | Unit tests, functional tests | Performance tests, custom scripts | + +### Test Definition Fields + +| Field | Required | Type | Description | +|-------|----------|------|-------------| +| `name` | Yes | string | Unique test identifier | +| `description` | Recommended | string | 
Human-readable test description | +| `is_gtest` | Optional | boolean | Whether test uses GTest framework (default: true). Set to false for perf or custom tests | +| `binary` | Yes | string | Test binary name (relative to build/test/) | +| `test_filter` | Optional | string | Test filter (GTest filter syntax for gtest, plain argument for non-gtest) | +| `command_args` | Optional | string | Additional command-line arguments | +| `num_ranks` | Optional | integer | Number of MPI ranks (default: 1) | +| `num_nodes` | Optional | integer | Number of nodes (default: 1) | +| `num_gpus` | Optional | integer | GPUs per node - controls rank distribution (default: 8) | +| `timeout` | Optional | integer | Timeout in seconds (0 = unlimited) | +| `env_variables` | Optional | object | Test-specific environment variables | + +### Configuration Inheritance + +Use the `"extends"` directive to inherit from parent configurations: + +```json +{ + "test_configurations": { + "base": { + "env_variables": { + "NCCL_DEBUG": "INFO" + } + }, + "shm_tests": { + "extends": "base", + "env_variables": { + "NCCL_SHM_DISABLE": "0" + }, + "tests": [...] + }, + "advanced_shm": { + "extends": ["base", "shm_tests"], + "env_variables": { + "NCCL_SHM_USE_CUDA_MEMCPY": "1" + } + } + } +} +``` + +### Hierarchical Defaults + +To reduce repetition, you can specify default values at multiple levels with a clear override hierarchy: + +**Priority Order (highest to lowest):** +1. **Individual test** - highest priority, overrides everything +2. **Test suite level** - overrides configuration defaults +3. **Configuration level** - base defaults for all tests in that config +4. 
**Built-in defaults** - system fallback values + +**Supported default fields:** `is_gtest`, `binary`, `num_ranks`, `num_nodes`, `num_gpus`, `timeout` + +#### Example with Three-Level Hierarchy + +```json +{ + "test_configurations": { + "p2p_tests": { + "is_gtest": true, + "binary": "rccl-UnitTestsMPI", + "num_ranks": 2, + "num_nodes": 1, + "num_gpus": 2, + "timeout": 120, + "env_variables": { + "NCCL_P2P_DISABLE": "0" + }, + "tests": [ + { + "name": "P2P_Basic", + "description": "Basic P2P test", + "test_filter": "P2pMPITest.Basic" + // Sets no defaults itself; config defaults apply unless a suite overrides them: is_gtest=true, binary, num_ranks=2, num_nodes=1, num_gpus=2, timeout=120 + }, + { + "name": "P2P_LongRunning", + "description": "Long-running P2P test", + "test_filter": "P2pMPITest.LongRunning", + "timeout": 300 + // Test-level timeout=300 overrides both suite and config timeouts; other fields are inherited + } + ] + } + }, + "test_suites": [ + { + "name": "P2P_Basic_Suite", + "config": "p2p_tests", + "num_ranks": 4, + "num_gpus": 4, + "timeout": 180 + // Suite-level: overrides config's num_ranks, num_gpus, and timeout + // Tests without their own values use: num_ranks=4, num_gpus=4, timeout=180 (P2P_LongRunning keeps timeout=300) + }, + { + "name": "P2P_Stress_Suite", + "config": "p2p_tests", + "num_nodes": 2, + "num_ranks": 4, + "num_gpus": 2, + "timeout": 600 + // Suite-level: overrides config's num_nodes, num_ranks, num_gpus, and timeout + // Tests without their own values use: num_nodes=2, num_ranks=4, num_gpus=2, timeout=600 (P2P_LongRunning keeps timeout=300) + } + ] +} +``` + +**Benefits:** +- **Less Repetition**: Define common values once +- **Easier Maintenance**: Update defaults in one place +- **Flexible Overrides**: Tests can still customize any field +- **Cleaner Config**: Shorter, more readable test definitions + +## Command-Line Options + +``` +Required: + -c, --config CONFIG Test configuration file (JSON format) + +Optional: + -v, --verbose Enable verbose output (shows build paths, commands, etc.) 
+ -o, --output DIR Output directory for logs and reports + --test-name NAME Run only specific test by name + --no-build Skip build step and use existing build + --skip-tests Skip test execution (useful with --coverage-report) + --coverage-report Generate code coverage report (HTML + text) + --overwrite Overwrite previous workspace directories + --report-suffix SUFFIX Suffix for report directory (default: blank) + -h, --help Show help message and exit +``` + +## Code Coverage Reports + +The test runner integrates with LLVM tools to generate comprehensive code coverage reports. + +### Generating Coverage + +```bash +# Build and test with coverage (recommended) +python test_runner.py --config test_config_sample.json --coverage-report --verbose + +# Generate report from existing profraw files +python test_runner.py --config test_config_sample.json --no-build --skip-tests --coverage-report +``` + +### Coverage Output + +When `--coverage-report` is specified, the runner generates: + +1. **HTML Report**: Visual coverage report in `reports/` directory + - View with: `firefox reports/index.html` + - Shows line-by-line coverage with syntax highlighting + +2. 
**Text Report**: Function-level coverage summary + - Location: `reports/function_coverage_report.txt` + - Includes per-function and per-file statistics + +### Coverage Implementation Details + +- Uses LLVM instrumentation (`-fprofile-instr-generate -fcoverage-mapping`) +- Collects `.profraw` files during test execution +- Merges profiles with `llvm-profdata` +- Generates reports with `llvm-cov show` and `llvm-cov report` +- Filters out irrelevant files (test/, gtest, external dependencies) + +## Examples + +### Run All Enabled Test Suites + +```bash +python test_runner.py --config test_config_sample.json --verbose +``` + +### Run Specific Test + +```bash +python test_runner.py --config test_config_sample.json --test-name P2P_AllTests +``` + +### Skip Build (Use Existing) + +```bash +python test_runner.py --config test_config_sample.json --no-build +``` + +### Build and Generate Coverage + +```bash +# Full workflow: build, test, coverage +python test_runner.py --config adhoc_test_config.json --coverage-report --verbose +``` + +### Generate Coverage from Existing Build + +```bash +# Skip build, use existing profraw files +python test_runner.py --config adhoc_test_config.json --no-build --skip-tests --coverage-report +``` + +### Custom Output Directory + +```bash +python test_runner.py --config test_config_sample.json -o /path/to/output --verbose +``` + +### Run with Overwrite (Clean Previous Results) + +```bash +python test_runner.py --config test_config_sample.json --overwrite --coverage-report +``` + +## Environment Variable Merging + +Environment variables are merged hierarchically (later values override earlier): + +1. **Global** `env_variables` (top-level in config) +2. **Configuration** `env_variables` (test configuration level) +3. **Test Suite** `env_variables` (suite level) +4. 
**Test-specific** `env_variables` (individual test level) + +Example: +```json +{ + "env_variables": { + "NCCL_DEBUG": "INFO" + }, + "test_configurations": { + "shm_tests": { + "env_variables": { + "NCCL_SHM_DISABLE": "0" + }, + "tests": [ + { + "name": "SHM_Test", + "env_variables": { + "NCCL_DEBUG": "TRACE" + } + } + ] + } + } +} +``` + +Result: `NCCL_DEBUG=TRACE`, `NCCL_SHM_DISABLE=0` + +## Test Execution + +### Single-Node Tests + +- All ranks run on a single node +- Multiple ranks map to different GPUs +- Examples: SHM tests, P2P tests, unit tests + +```json +{ + "name": "SHM_Test", + "num_ranks": 2, + "num_nodes": 1 +} +``` + +### Multi-Node Tests + +- Ranks distributed across multiple nodes via MPI +- Requires SLURM allocation or hostfile configuration +- Use `num_gpus` to control ranks per node (default: 8) +- Examples: NET transport tests, InfiniBand tests + +```json +{ + "name": "NET_Test_4Nodes_2GPUs", + "num_ranks": 8, + "num_nodes": 4, + "num_gpus": 2 +} +``` + +**`num_gpus` Field:** +- Controls how many MPI ranks are placed on each node +- Overrides hostfile `slots` specification +- For multi-node tests, uses `--map-by ppr:{num_gpus}:node` +- Default value: 8 (matches typical 8-GPU nodes) + +**Example: 2 nodes, 1 GPU per node** +```json +{ + "name": "NET_Test_2Nodes_1GPU", + "num_ranks": 2, + "num_nodes": 2, + "num_gpus": 1 +} +``` +Command: `mpirun -np 2 --hostfile file --map-by ppr:1:node ...` + +### Setting Up Multi-Node Tests + +**Option 1: MPI Hostfile** +```bash +export RCCL_TEST_MPI_HOSTFILE=/path/to/hostfile +python test_runner.py --config net_ib_test_config.json +``` + +**Option 2: Default Hostfile** +Create `~/.mpi_hostfile` with node names (one per line): +``` +node01 slots=8 +node02 slots=8 +``` + +## Advanced Features + +### Build Configuration (New!) + +Customize the RCCL build process through the `build_configuration` section in your JSON config file. 
+ +#### Basic Structure + +```json +{ + "build_configuration": { + "cmake_options": { + "CMAKE_BUILD_TYPE": "Debug", + "ENABLE_CODE_COVERAGE": "ON", + "ONLY_FUNCS": "SendRecv|AllReduce" + }, + "env_variables": { + "HIPCC_COMPILE_FLAGS_APPEND": "-g -O1" + }, + "parallel_jobs": 64, + "generator": "Unix Makefiles" + } +} +``` + +#### Examples + +**Fast Development Build (No Coverage):** +```json +{ + "build_configuration": { + "cmake_options": { + "ENABLE_CODE_COVERAGE": "OFF" + }, + "parallel_jobs": 128 + } +} +``` + +**Release Build:** +```json +{ + "build_configuration": { + "cmake_options": { + "CMAKE_BUILD_TYPE": "Release", + "TRACE": "OFF", + "COLLTRACE": "OFF" + } + } +} +``` + +**Test Specific Functions Only:** +```json +{ + "build_configuration": { + "cmake_options": { + "ONLY_FUNCS": "Broadcast|Reduce" + } + } +} +``` + +**All Options:** +- `cmake_options` - Any CMake option (user values override defaults) +- `env_variables` - Build environment variables +- `parallel_jobs` - Number of parallel build threads (default: 64) +- `generator` - CMake generator: "Unix Makefiles", "Ninja", etc. + +See `BUILD_CONFIGURATION_GUIDE.md` for complete documentation. + +### Enhanced Environment Variable Expansion + +Environment variables in the `paths` section now support **nested expansion** in default values: + +```json +{ + "paths": { + "workdir": "${WORKDIR:-$HOME/code/rti/scripts/rccl}", + "rocm_path": "${ROCM_PATH:-/opt/rocm}", + "mpi_path": "${MPI_PATH:-$HOME/softwares/ompi}" + } +} +``` + +**Key Feature:** If `WORKDIR` is not set, the default `$HOME/code/rti/scripts/rccl` is used, and the `$HOME` inside it is expanded automatically. + +### Flexible Binary Paths + +Specify test binary locations in multiple ways for maximum flexibility: + +#### 1. Default (Relative to build_dir/test/) + +```json +{ + "binary": "all_reduce_perf" +} +``` +Result: `<workdir>/build_debug_cov_on_tests_on/test/all_reduce_perf` + +#### 2. 
Absolute Path + +```json +{ + "binary": "/opt/custom_rccl_build/test/all_reduce_perf" +} +``` +Result: Uses the absolute path directly + +#### 3. Environment Variable in Binary Name + +```json +{ + "binary": "${MY_RCCL_TESTS}/all_reduce_perf" +} +``` +Result: Expands `$MY_RCCL_TESTS` environment variable + +#### 4. Home Directory Expansion + +```json +{ + "binary": "~/my_builds/rccl/test/all_reduce_perf" +} +``` +Result: Expands `~` to home directory + +#### 5. Using test_binary_dir in Paths + +```json +{ + "paths": { + "test_binary_dir": "${RCCL_TEST_BIN_DIR}" + }, + "test_configurations": { + "my_tests": { + "binary": "all_reduce_perf" + } + } +} +``` +Result: `${RCCL_TEST_BIN_DIR}/all_reduce_perf` + +#### 6. Using test_binary_dir in Test Config + +```json +{ + "test_configurations": { + "my_tests": { + "tests": [ + { + "name": "CustomBinary", + "test_binary_dir": "/opt/rccl/tests", + "binary": "all_reduce_perf" + } + ] + } + } +} +``` +Result: `/opt/rccl/tests/all_reduce_perf` + +#### Resolution Priority Order + +1. **Absolute path in binary** - Highest priority +2. **Environment variable expansion** (if results in absolute path) +3. **test_binary_dir in test config** + binary +4. **test_binary_dir in paths** + binary +5. 
**Default:** `build_dir/test/` + binary - Lowest priority + +#### Use Cases + +- **CI/CD with pre-built binaries:** Use absolute paths or `RCCL_TEST_BIN_DIR` +- **Multiple RCCL versions:** Different `test_binary_dir` per configuration +- **Custom build locations:** Environment variables for flexibility +- **Standard builds:** Use default (no configuration needed) + +#### Verbose Mode + +Use `--verbose` to see the resolved binary path: +```bash +python test_runner.py --config test.json --verbose +``` + +Output includes: +``` +Binary: all_reduce_perf +Binary path: /home/user/code/rti/scripts/rccl/build_debug_cov_on_tests_on/test/all_reduce_perf +``` + +### Configuration Best Practices + +**Reduce Repetition:** Move common values to configuration level + +```json +{ + "test_configurations": { + "p2p_tests": { + "timeout": 120, + "env_variables": { + "NCCL_P2P_USE_CUDA_MEMCPY": "1", + "NCCL_LEGACY_CUDA_REGISTER": "1" + }, + "tests": [ + { + "name": "Test1" + // Inherits timeout and env vars from config level + }, + { + "name": "Test2", + "timeout": 300 + // Overrides timeout, inherits env vars + } + ] + } + } +} +``` + +**Benefits:** +- ✅ Single source of truth for common settings +- ✅ Easier maintenance +- ✅ Tests can still override when needed +- ✅ Cleaner, more readable configurations + +## Development and Testing + +### Validate Configuration + +```bash +# Test JSON syntax +python3 -m json.tool test_config_sample.json + +# Test configuration loading +python3 -c "from lib.test_config import TestConfigProcessor; \ + p = TestConfigProcessor('test_config_sample.json'); \ + print('Configuration valid!')" + +# Dry run (validate without executing) +python test_runner.py --config test_config_sample.json --skip-tests --verbose +``` + +### Adding New Tests + +1. Add test definition to appropriate configuration in JSON file +2. Specify `is_gtest`, `description`, and required fields +3. Test with dry run first: `--skip-tests --verbose` +4. 
Run actual test: `--test-name YourTest --verbose` + +### Test Type Handling + +The test runner uses a boolean `is_gtest` flag to distinguish between test types: + +- **`is_gtest: true`** (default): Uses GTest framework with `--gtest_filter=` syntax +- **`is_gtest: false`**: Runs binary with plain arguments (for performance tests, custom scripts, etc.) + +This simplified approach eliminates the need for multiple test type conditionals while supporting all test categories (gtest, perf, custom). + +## Troubleshooting + +### "Configuration file not found" +- Check the path to your JSON config file +- Use absolute paths or ensure you're in the correct directory +- Verify file permissions + +### "MPI path not found" +- Update `paths.mpi_path` in your configuration +- Ensure MPI is installed: `which mpirun` +- Check the `MPI_PATH` environment variable + +### "Test binary not found" +- Build first: remove `--no-build` flag +- Check binary name in `build/test/` directory +- Verify that the CMake build completed successfully + +### Multi-node tests hang +- Ensure SLURM allocation or hostfile is configured +- Check network connectivity: `ping other_node` +- Verify MPI can reach nodes: `mpirun -np 2 hostname` +- Check firewall settings + +### CMake configuration fails +- Check ROCm path: `ls $ROCM_PATH` +- Verify compiler: `$ROCM_PATH/bin/amdclang++ --version` +- Check MPI path: `ls $MPI_PATH/bin/mpirun` + +### Coverage report fails +- Ensure LLVM tools are available: `which llvm-profdata llvm-cov` +- Check for `.profraw` files in build directory +- Verify coverage build flags were set correctly +- Run with `--verbose` to see detailed error messages + +### "LLVM_PROFILE_FILE not being used" +- Ensure `--coverage-report` flag is specified +- Check that tests are actually executing (not skipped) +- Verify environment variables with `--verbose` + +--- + +## Appendix: Environment Variables Reference + +This section provides a quick reference for all environment variables supported by the test runner. 
+ +### Library and Build Location + +| Variable | Description | Example | +|----------|-------------|---------| +| `RCCL_LIB_PATH` | Path to pre-built RCCL library directory. Automatically skips build. | `export RCCL_LIB_PATH=/path/to/rccl/build` | +| `RCCL_BUILD_DIR` | Alternative name for `RCCL_LIB_PATH`. | `export RCCL_BUILD_DIR=/home/user/rccl_builds/debug` | + +**Requirements**: Directory must contain `librccl.so` and `test/` subdirectory. + +### Configuration Paths + +These override the paths specified in the JSON configuration file: + +| Variable | Description | Example | +|----------|-------------|---------| +| `WORKDIR` | RCCL source and build directory | `export WORKDIR=/home/user/code/rccl` | +| `ROCM_PATH` | ROCm installation path | `export ROCM_PATH=/opt/rocm-6.0` | +| `MPI_PATH` | MPI installation path | `export MPI_PATH=/usr/local/openmpi` | + +### Test Execution + +| Variable | Description | Example | +|----------|-------------|---------| +| `RCCL_TEST_MPI_HOSTFILE` | Path to MPI hostfile for multi-node tests | `export RCCL_TEST_MPI_HOSTFILE=~/.mpi_hostfile` | + +**Note**: Falls back to `~/.mpi_hostfile` if not set. For SLURM environments, hostfile is auto-generated from `SLURM_NODELIST`. 
+ +### Test-Specific Variables + +These can be set globally or specified in the JSON configuration per test: + +| Variable | Description | Example | +|----------|-------------|---------| +| `NCCL_DEBUG` | NCCL debug level (VERSION, WARN, INFO, TRACE) | `export NCCL_DEBUG=INFO` | +| `NCCL_DEBUG_SUBSYS` | NCCL debug subsystems to enable | `export NCCL_DEBUG_SUBSYS=INIT,COLL,NET` | +| `HSA_NO_SCRATCH_RECLAIM` | Disable HIP scratch memory reclaim | `export HSA_NO_SCRATCH_RECLAIM=1` | +| `NCCL_LAUNCH_MODE` | NCCL launch mode (GROUP, PARALLEL) | `export NCCL_LAUNCH_MODE=GROUP` | + +### Coverage and Profiling + +| Variable | Description | Example | +|----------|-------------|---------| +| `LLVM_PROFILE_FILE` | LLVM coverage profile output pattern | `export LLVM_PROFILE_FILE=rccl_%p_%m.profraw` | + +**Note**: Automatically set by test runner to prevent collisions. Manual override not recommended. + +### Complete Example + +```bash +#!/bin/bash +# Configure paths +export WORKDIR=/home/user/code/rccl +export ROCM_PATH=/opt/rocm-6.0 +export MPI_PATH=/usr/local/openmpi + +# Use pre-built library +export RCCL_LIB_PATH=/home/user/rccl_builds/instrumented + +# Configure MPI +export RCCL_TEST_MPI_HOSTFILE=~/.mpi_hostfile + +# Enable debug output +export NCCL_DEBUG=INFO +export NCCL_DEBUG_SUBSYS=INIT,COLL,NET + +# Run tests +python test_runner.py --config my_tests.json --verbose +``` + +### Variable Priority + +When the same configuration can be specified in multiple places, the priority is: + +1. **Environment variables** (highest priority) +2. **Test-specific configuration** (in JSON) +3. **Test suite configuration** (in JSON) +4. **Test configuration defaults** (in JSON) +5. **Built-in defaults** (lowest priority) + +**Example**: If `ROCM_PATH` is set as an environment variable, it overrides the `rocm_path` value in the JSON configuration file. 
+ diff --git a/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json b/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json new file mode 100644 index 0000000000..33041689e8 --- /dev/null +++ b/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json @@ -0,0 +1,506 @@ +{ + "system_configurations": { + "name": "RCCL-Tests-MI300X-Mellanox-IB", + "description": "Comprehensive RCCL Test Configuration - All Tests" + }, + "paths": { + "workdir": "${WORKDIR:-$PWD}", + "rocm_path": "${ROCM_PATH:-/opt/rocm}", + "mpi_path": "${MPI_PATH:-/opt/ompi}" + }, + "env_variables": { + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_SOCKET_IFNAME": "eth0,eth1", + "NCCL_DEBUG": "INFO" + }, + "build_configuration": { + "cmake_options": { + "CMAKE_BUILD_TYPE": "Debug", + "ENABLE_CODE_COVERAGE": "ON", + "BUILD_TESTS": "ON", + "BUILD_LOCAL_GPU_TARGET_ONLY": "ON", + "TRACE": "ON", + "COLLTRACE": "ON" + }, + "env_variables": { + "HIPCC_COMPILE_FLAGS_APPEND": "-g -Wno-format-nonliteral -Xarch_host -fprofile-instr-generate -Xarch_host -fcoverage-mapping -parallel-jobs=16" + }, + "parallel_jobs": 64, + "generator": "Unix Makefiles" + }, + "test_configurations": { + "default": { + "env_variables": { + "NCCL_LAUNCH_MODE": "GROUP" + } + }, + "shm_comprehensive": { + "extends": "default", + "is_gtest": true, + "binary": "rccl-UnitTestsMPI", + "num_ranks": 2, + "num_nodes": 1, + "num_gpus": 2, + "timeout": 120, + "env_variables": { + "NCCL_SHM_DISABLE": "0", + "NCCL_SHM_USE_CUDA_MEMCPY": "1" + }, + "tests": [ + { + "name": "SHM_ComprehensiveWorkflow", + "description": "Comprehensive workflow test for shared memory transport", + "test_filter": "ShmMPITest.ShmWorkflow" + }, + { + "name": "SHM_CEMemcpy_SendSide", + "description": "Shared memory test with compute engine memcpy on send side", + "test_filter": "ShmMPITest.ShmWithMemcpyTest", + "timeout": 180, + "env_variables": { + "NCCL_SHM_MEMCPY_MODE": "1", + "NCCL_SHM_LOCALITY": "1" + } + }, + { + "name": 
"SHM_CEMemcpy_RecvSide", + "description": "Shared memory test with compute engine memcpy on receive side", + "test_filter": "ShmMPITest.ShmWithMemcpyTest", + "env_variables": { + "NCCL_SHM_MEMCPY_MODE": "2", + "NCCL_SHM_LOCALITY": "2" + } + }, + { + "name": "SHM_CEMemcpy_BothSides", + "description": "Shared memory test with compute engine memcpy on both send and receive sides using simple protocol", + "test_filter": "ShmMPITest.ShmWithMemcpyTest", + "env_variables": { + "NCCL_PROTO": "SIMPLE", + "NCCL_SHM_MEMCPY_MODE": "3", + "NCCL_SHM_LOCALITY": "1" + } + }, + { + "name": "SHM_AllTests", + "description": "All shared memory transport tests", + "test_filter": "ShmMPITest.*" + } + ] + }, + "p2p_comprehensive": { + "extends": "default", + "is_gtest": true, + "binary": "rccl-UnitTestsMPI", + "num_ranks": 2, + "num_nodes": 1, + "num_gpus": 2, + "timeout": 120, + "env_variables": { + "NCCL_P2P_USE_CUDA_MEMCPY": "1", + "NCCL_LEGACY_CUDA_REGISTER": "1" + }, + "tests": [ + { + "name": "P2P_Workflow", + "description": "Peer-to-peer transport workflow test between two GPUs", + "test_filter": "P2pMPITest.P2pWorkflow", + "env_variables": { + "NCCL_P2P_DISABLE": "0" + } + }, + { + "name": "P2P_WithMemcpy", + "description": "Peer-to-peer test with CUDA memcpy and legacy buffer registration", + "test_filter": "P2pMPITest.P2pWithMemcpyTest" + }, + { + "name": "P2P_SendRecvRegistration", + "description": "Test peer-to-peer send/receive buffer registration mechanisms", + "test_filter": "P2pMPITest.P2pSendRecvRegistrationTest" + }, + { + "name": "P2P_IpcReg_VerySmallBuffer", + "description": "Test P2P IPC buffer registration with very small buffer sizes and SHM disabled", + "test_filter": "P2pMPITest.P2pIpcBufferRegistration_VerySmallBuffer", + "env_variables": { + "NCCL_SHM_DISABLE": "1", + "NCCL_LOCAL_REGISTER": "1" + } + }, + { + "name": "P2P_AllTests", + "description": "All peer-to-peer transport tests", + "test_filter": "P2pMPITest.*" + } + ] + }, + "net_transport_eth_multinode": 
{ + "extends": "default", + "is_gtest": true, + "binary": "rccl-UnitTestsMPI", + "num_ranks": 2, + "num_nodes": 2, + "num_gpus": 1, + "env_variables": { + "NCCL_NET_SHARED_COMMS": "1", + "NCCL_NET_SHARED_BUFFERS": "1", + "NCCL_IB_DISABLE": "0" + }, + "tests": [ + { + "name": "NET_AllTests_2Nodes_ETH", + "description": "All network transport tests over Ethernet across two nodes", + "test_filter": "NetTransportMPITest.*", + "timeout": 600 + }, + { + "name": "NET_MultipleBufferSizes_2Nodes", + "description": "Network transport test with multiple buffer sizes across two nodes", + "test_filter": "NetTransportMPITest.MultipleBufferSizesTest", + "timeout": 180 + }, + { + "name": "NET_NetGraphRegister_2Nodes", + "description": "Network transport test with graph buffer registration across two nodes", + "test_filter": "NetTransportMPITest.NetGraphRegisterBufferTest", + "timeout": 120, + "env_variables": { + "NCCL_GRAPH_REGISTER": "1" + } + } + ] + }, + "net_ib_base": { + "extends": "default", + "is_gtest": true, + "binary": "rccl-UnitTestsMPI", + "num_ranks": 2, + "num_nodes": 1, + "num_gpus": 2, + "env_variables": { + "NCCL_DMABUF_ENABLE": "1" + } + }, + "net_ib_initialization": { + "extends": "net_ib_base", + "timeout": 60, + "tests": [ + { + "name": "NetIB_Init_Plugin", + "description": "Initialize InfiniBand network plugin and verify basic setup", + "test_filter": "NetIbMPITest.InitializePlugin" + }, + { + "name": "NetIB_Init_GetDeviceCount", + "description": "Query and validate InfiniBand device count", + "test_filter": "NetIbMPITest.GetDeviceCount" + } + ] + }, + "net_ib_properties": { + "extends": "net_ib_base", + "timeout": 60, + "tests": [ + { + "name": "NetIB_Props_GetProperties", + "description": "Query InfiniBand device properties and capabilities", + "test_filter": "NetIbMPITest.GetDeviceProperties" + }, + { + "name": "NetIB_Props_InvalidDevice", + "description": "Test error handling when querying properties of invalid InfiniBand device", + "test_filter": 
"NetIbMPITest.GetDevicePropertiesInvalidDevice" + } + ] + }, + "net_ib_memory": { + "extends": "net_ib_base", + "timeout": 120, + "tests": [ + { + "name": "NetIB_Mem_RegisterHost", + "description": "Test InfiniBand registration of host memory buffers", + "test_filter": "NetIbMPITest.RegisterHostMemory" + }, + { + "name": "NetIB_Mem_RegisterGpu", + "description": "Test InfiniBand registration of GPU device memory buffers", + "test_filter": "NetIbMPITest.RegisterGpuMemory" + } + ] + }, + "net_ib_transfer": { + "extends": "net_ib_base", + "env_variables": { + "NCCL_DEBUG": "TRACE", + "NCCL_DEBUG_SUBSYS": "NET,INIT" + }, + "tests": [ + { + "name": "NetIB_Xfer_SimpleSendRecv", + "description": "Basic InfiniBand send/receive data transfer test", + "test_filter": "NetIbMPITest.SimpleSendRecv", + "timeout": 180, + "env_variables": { + "RCCL_MPI_LOG_ALL_RANKS": "1" + } + }, + { + "name": "NetIB_Xfer_MultipleSizes", + "description": "InfiniBand data transfer with multiple buffer sizes", + "test_filter": "NetIbMPITest.SendRecvMultipleSizes", + "timeout": 300 + }, + { + "name": "NetIB_Stress_LargeTransfer", + "description": "Stress test for large data transfers over InfiniBand", + "test_filter": "NetIbMPITest.LargeTransfer", + "timeout": 300 + } + ] + }, + "unit_tests_fixtures": { + "is_gtest": true, + "binary": "rccl-UnitTestsFixtures", + "num_ranks": 1, + "num_nodes": 1, + "num_gpus": 8, + "timeout": 0, + "tests": [ + { + "name": "NetIb_Debug", + "description": "InfiniBand unit tests with debug output using test fixtures", + "test_filter": "NetIbTests.*", + "env_variables": { + "NCCL_SOCKET_IFNAME": "eth1" + } + }, + { + "name": "Rcclwrap_All", + "description": "RCCL wrapper API unit tests with trace-level debugging", + "test_filter": "Rcclwrap.*", + "env_variables": { + "NCCL_DEBUG": "TRACE" + } + }, + { + "name": "Fixtures_All", + "description": "All Fixtures tests", + "env_variables": { + "NCCL_DEBUG": "TRACE" + } + } + ] + }, + "unit_tests_standard": { + "is_gtest": 
true, + "binary": "rccl-UnitTests", + "num_ranks": 1, + "num_nodes": 1, + "num_gpus": 8, + "timeout": 0, + "env_variables": { + "NCCL_DEBUG": "" + }, + "tests": [ + {"name": "AllReduce.OutOfPlace", "description": "AllReduce out-of-place", "test_filter": "AllReduce.OutOfPlace"}, + {"name": "AllReduce.OutOfPlaceGraph", "description": "AllReduce out-of-place graph", "test_filter": "AllReduce.OutOfPlaceGraph"}, + {"name": "AllReduce.InPlace", "description": "AllReduce in-place", "test_filter": "AllReduce.InPlace"}, + {"name": "AllReduce.InPlaceGraph", "description": "AllReduce in-place graph", "test_filter": "AllReduce.InPlaceGraph"}, + {"name": "AllReduce.ManagedMem", "description": "AllReduce managed memory", "test_filter": "AllReduce.ManagedMem"}, + {"name": "AllReduce.Channels", "description": "AllReduce channels", "test_filter": "AllReduce.Channels"}, + {"name": "AllReduce.ManagedMemGraph", "description": "AllReduce managed memory graph", "test_filter": "AllReduce.ManagedMemGraph"}, + {"name": "AllReduce.PreMultScalar", "description": "AllReduce pre-mult scalar", "test_filter": "AllReduce.PreMultScalar"}, + {"name": "AllReduce.UserBufferRegistration", "description": "AllReduce user buffer registration", "test_filter": "AllReduce.UserBufferRegistration"}, + {"name": "AllReduce.ManagedMemUserBufferRegistration", "description": "AllReduce managed mem user buffer", "test_filter": "AllReduce.ManagedMemUserBufferRegistration"}, + {"name": "AllGather.OutOfPlace", "description": "AllGather out-of-place", "test_filter": "AllGather.OutOfPlace"}, + {"name": "AllGather.OutOfPlaceGraph", "description": "AllGather out-of-place graph", "test_filter": "AllGather.OutOfPlaceGraph"}, + {"name": "AllGather.InPlace", "description": "AllGather in-place", "test_filter": "AllGather.InPlace"}, + {"name": "AllGather.InPlaceGraph", "description": "AllGather in-place graph", "test_filter": "AllGather.InPlaceGraph"}, + {"name": "AllGather.ManagedMem", "description": "AllGather managed 
memory", "test_filter": "AllGather.ManagedMem"}, + {"name": "AllGather.ManagedMemGraph", "description": "AllGather managed memory graph", "test_filter": "AllGather.ManagedMemGraph"}, + {"name": "AllGather.UserBufferRegistration", "description": "AllGather user buffer registration", "test_filter": "AllGather.UserBufferRegistration"}, + {"name": "AllGather.ManagedMemUserBufferRegistration", "description": "AllGather managed mem user buffer", "test_filter": "AllGather.ManagedMemUserBufferRegistration"}, + {"name": "AllToAll.OutOfPlace", "description": "AllToAll out-of-place", "test_filter": "AllToAll.OutOfPlace"}, + {"name": "AllToAll.OutOfPlaceGraph", "description": "AllToAll out-of-place graph", "test_filter": "AllToAll.OutOfPlaceGraph"}, + {"name": "AllToAll.ManagedMem", "description": "AllToAll managed memory", "test_filter": "AllToAll.ManagedMem"}, + {"name": "AllToAll.ManagedMemGraph", "description": "AllToAll managed memory graph", "test_filter": "AllToAll.ManagedMemGraph"}, + {"name": "AllToAllv.OutOfPlace", "description": "AllToAllv out-of-place", "test_filter": "AllToAllv.OutOfPlace"}, + {"name": "AllToAllv.OutOfPlaceGraph", "description": "AllToAllv out-of-place graph", "test_filter": "AllToAllv.OutOfPlaceGraph"}, + {"name": "Broadcast.OutOfPlace", "description": "Broadcast out-of-place", "test_filter": "Broadcast.OutOfPlace"}, + {"name": "Broadcast.OutOfPlaceGraph", "description": "Broadcast out-of-place graph", "test_filter": "Broadcast.OutOfPlaceGraph"}, + {"name": "Broadcast.InPlace", "description": "Broadcast in-place", "test_filter": "Broadcast.InPlace"}, + {"name": "Broadcast.InPlaceGraph", "description": "Broadcast in-place graph", "test_filter": "Broadcast.InPlaceGraph"}, + {"name": "Broadcast.ManagedMem", "description": "Broadcast managed memory", "test_filter": "Broadcast.ManagedMem"}, + {"name": "Broadcast.ManagedMemGraph", "description": "Broadcast managed memory graph", "test_filter": "Broadcast.ManagedMemGraph"}, + {"name": 
"Gather.OutOfPlace", "description": "Gather out-of-place", "test_filter": "Gather.OutOfPlace"}, + {"name": "Gather.OutOfPlaceGraph", "description": "Gather out-of-place graph", "test_filter": "Gather.OutOfPlaceGraph"}, + {"name": "Gather.InPlace", "description": "Gather in-place", "test_filter": "Gather.InPlace"}, + {"name": "Gather.InPlaceGraph", "description": "Gather in-place graph", "test_filter": "Gather.InPlaceGraph"}, + {"name": "Gather.ManagedMem", "description": "Gather managed memory", "test_filter": "Gather.ManagedMem"}, + {"name": "Gather.ManagedMemGraph", "description": "Gather managed memory graph", "test_filter": "Gather.ManagedMemGraph"}, + {"name": "Scatter.OutOfPlace", "description": "Scatter out-of-place", "test_filter": "Scatter.OutOfPlace"}, + {"name": "Scatter.OutOfPlaceGraph", "description": "Scatter out-of-place graph", "test_filter": "Scatter.OutOfPlaceGraph"}, + {"name": "Scatter.InPlace", "description": "Scatter in-place", "test_filter": "Scatter.InPlace"}, + {"name": "Scatter.InPlaceGraph", "description": "Scatter in-place graph", "test_filter": "Scatter.InPlaceGraph"}, + {"name": "Scatter.ManagedMem", "description": "Scatter managed memory", "test_filter": "Scatter.ManagedMem"}, + {"name": "Scatter.ManagedMemGraph", "description": "Scatter managed memory graph", "test_filter": "Scatter.ManagedMemGraph"}, + {"name": "Reduce.OutOfPlace", "description": "Reduce out-of-place", "test_filter": "Reduce.OutOfPlace"}, + {"name": "Reduce.OutOfPlaceGraph", "description": "Reduce out-of-place graph", "test_filter": "Reduce.OutOfPlaceGraph"}, + {"name": "Reduce.InPlace", "description": "Reduce in-place", "test_filter": "Reduce.InPlace"}, + {"name": "Reduce.InPlaceGraph", "description": "Reduce in-place graph", "test_filter": "Reduce.InPlaceGraph"}, + {"name": "Reduce.ManagedMem", "description": "Reduce managed memory", "test_filter": "Reduce.ManagedMem"}, + {"name": "Reduce.ManagedMemGraph", "description": "Reduce managed memory graph", 
"test_filter": "Reduce.ManagedMemGraph"}, + {"name": "ReduceScatter.OutOfPlace", "description": "ReduceScatter out-of-place", "test_filter": "ReduceScatter.OutOfPlace"}, + {"name": "ReduceScatter.OutOfPlaceGraph", "description": "ReduceScatter out-of-place graph", "test_filter": "ReduceScatter.OutOfPlaceGraph"}, + {"name": "ReduceScatter.InPlace", "description": "ReduceScatter in-place", "test_filter": "ReduceScatter.InPlace"}, + {"name": "ReduceScatter.InPlaceGraph", "description": "ReduceScatter in-place graph", "test_filter": "ReduceScatter.InPlaceGraph"}, + {"name": "ReduceScatter.ManagedMem", "description": "ReduceScatter managed memory", "test_filter": "ReduceScatter.ManagedMem"}, + {"name": "ReduceScatter.ManagedMemGraph", "description": "ReduceScatter managed memory graph", "test_filter": "ReduceScatter.ManagedMemGraph"}, + {"name": "SendRecv.SinglePairs", "description": "SendRecv single pairs", "test_filter": "SendRecv.SinglePairs"}, + {"name": "SendRecv.UserBufferRegister", "description": "SendRecv user buffer register", "test_filter": "SendRecv.UserBufferRegister"}, + {"name": "GroupCall.Identical", "description": "GroupCall identical", "test_filter": "GroupCall.Identical"}, + {"name": "GroupCall.Different", "description": "GroupCall different", "test_filter": "GroupCall.Different"}, + {"name": "GroupCall.Multistream", "description": "GroupCall multistream", "test_filter": "GroupCall.Multistream"}, + {"name": "GroupCall.MixedDataType", "description": "GroupCall mixed data type", "test_filter": "GroupCall.MixedDataType"}, + {"name": "GroupCall.MultiGroupCall", "description": "GroupCall multi group call", "test_filter": "GroupCall.MultiGroupCall"}, + {"name": "NonBlocking.SingleCalls", "description": "NonBlocking single calls", "test_filter": "NonBlocking.SingleCalls"}, + {"name": "CommTests.Sorter", "description": "CommTests sorter", "test_filter": "CommTests.Sorter"}, + {"name": "Enqueue", "description": "Enqueue operation tests", "test_filter": 
"Enqueue.*"}, + {"name": "Alloc", "description": "Memory allocation tests", "test_filter": "Alloc.*"}, + {"name": "ParamTests", "description": "Parameter handling tests", "test_filter": "ParamTests.*"}, + {"name": "ProxyTests", "description": "Proxy service tests", "test_filter": "ProxyTests.*"}, + {"name": "Rcclwrap", "description": "RCCL wrapper tests", "test_filter": "Rcclwrap.*"}, + {"name": "TransportTest", "description": "Transport layer tests", "test_filter": "TransportTest.*"}, + {"name": "ArgCheck", "description": "Argument validation tests", "test_filter": "ArgCheck.*"}, + {"name": "BitOps", "description": "Bit operation utility tests", "test_filter": "ALIGN_*:DIVUP:ROUNDUP:u32fp*:*Hash"}, + {"name": "AltRsmi", "description": "Alternative RSMI tests", "test_filter": "AltRsmi.*"}, + {"name": "NetSocket", "description": "Network socket tests", "test_filter": "NetSocket.*"}, + {"name": "Ipcsocket", "description": "IPC socket tests", "test_filter": "Ipcsocket.*"}, + {"name": "Standalone.SplitComms_RankCheck", "description": "Verify device assignment for each rank using ncclCommSplit API", "test_filter": "Standalone.SplitComms_RankCheck"}, + {"name": "Standalone.SplitComms_OneColor", "description": "Creates communicator for each device with same color", "test_filter": "Standalone.SplitComms_OneColor"}, + {"name": "Standalone.SplitComms_Reduce", "description": "Reduces communicators into fewer ranks", "test_filter": "Standalone.SplitComms_Reduce"}, + {"name": "Standalone.RegressionTiming", "description": "Verify no timing regression for protocols (LL, LL128, Simple)", "test_filter": "Standalone.RegressionTiming"}, + {"name": "Standalone.StackSize", "description": "Verify RCCL kernel stack size for each gfx architecture", "test_filter": "Standalone.StackSize"}, + {"name": "Standalone.CommCuDevice_Check", "description": "Verify device associated with communicator in single/multi-device scenarios", "test_filter": "Standalone.CommCuDevice_Check"}, + {"name": 
"Standalone.SplitComms_RankCheck_Basic_Failure", "description": "Verify ncclCommUserRank fails with invalid communicator handle", "test_filter": "Standalone.SplitComms_RankCheck_Basic_Failure"} + ] + }, + "debug_tests": { + "is_gtest": true, + "binary": "rccl-UnitTests", + "num_ranks": 1, + "num_nodes": 1, + "num_gpus": 8, + "timeout": 0, + "env_variables": { + "NCCL_DEBUG": "VERSION", + "NCCL_DEBUG_SUBSYS": "ALL" + }, + "tests": [ + { + "name": "Debug_ThreadName", + "description": "Test thread naming functionality with AllToAll operation", + "test_filter": "AllToAll.OutOfPlaceGraph*", + "env_variables": { + "NCCL_SET_THREAD_NAME": "1" + } + }, + { + "name": "Debug_AllSubsystems", + "description": "Test debug logging for all RCCL subsystems with trace-level output", + "test_filter": "AllToAll.OutOfPlaceGraph*", + "env_variables": { + "NCCL_DEBUG": "TRACE", + "NCCL_DEBUG_FILE": "cdcvg.dbg", + "NCCL_DEBUG_SUBSYS": "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC,CALL,PROXY,NVLS,BOOTSTRAP,REG,PROFILE,RAS,VERBS" + } + } + ] + }, + "alt_rsmi_tests": { + "is_gtest": true, + "binary": "rccl-UnitTestsFixtures", + "num_ranks": 1, + "num_nodes": 1, + "num_gpus": 1, + "timeout": 120, + "tests": [ + { + "name": "AltRsmi_AllTests", + "description": "All Alternative RSMI implementation tests using public API only", + "test_filter": "AltRsmiTest.*" + } + ] + } + }, + "test_suites": [ + { + "name": "SHM Tests - Complete Suite", + "description": "All shared memory transport tests (single node)", + "config": "shm_comprehensive", + "enabled": true + }, + { + "name": "P2P Tests - Complete Suite", + "description": "All peer-to-peer transport tests (single node)", + "config": "p2p_comprehensive", + "enabled": true + }, + { + "name": "NET Transport - Ethernet (Multi-Node)", + "description": "Network transport tests over Ethernet", + "config": "net_transport_eth_multinode", + "enabled": true + }, + { + "name": "NET IB - Initialization Tests", + "description": "InfiniBand plugin 
initialization and device enumeration", + "config": "net_ib_initialization", + "enabled": true + }, + { + "name": "NET IB - Device Properties", + "description": "InfiniBand device property queries", + "config": "net_ib_properties", + "enabled": true + }, + { + "name": "NET IB - Memory Registration", + "description": "InfiniBand memory registration tests", + "config": "net_ib_memory", + "enabled": true + }, + { + "name": "NET IB - Data Transfer", + "description": "InfiniBand data transfer and stress tests", + "config": "net_ib_transfer", + "enabled": true + }, + { + "name": "Unit Tests - Fixtures", + "description": "Non-MPI unit tests using fixtures", + "config": "unit_tests_fixtures", + "enabled": true + }, + { + "name": "Unit Tests - Standard Collectives", + "description": "Basic collective operation tests", + "config": "unit_tests_standard", + "enabled": true + }, + { + "name": "Debug and Logging Tests", + "description": "Tests for debug output and logging functionality", + "config": "debug_tests", + "enabled": true + }, + { + "name": "AltRsmi Tests - Complete Suite", + "description": "All Alternative RSMI tests using public API only", + "config": "alt_rsmi_tests", + "enabled": true + } + ] +} diff --git a/projects/rccl/tools/scripts/test_runner/configs/rccl_perf_tests.json b/projects/rccl/tools/scripts/test_runner/configs/rccl_perf_tests.json new file mode 100644 index 0000000000..bdf1810a0c --- /dev/null +++ b/projects/rccl/tools/scripts/test_runner/configs/rccl_perf_tests.json @@ -0,0 +1,458 @@ +{ + "system_configurations": { + "name": "RCCL-Performance-Benchmarks", + "description": "RCCL Performance Test Suite - All Collective Operations" + }, + "paths": { + "workdir": "${WORKDIR:-/path/to/rccl}", + "rocm_path": "${ROCM_PATH:-/opt/rocm}", + "mpi_path": "${MPI_PATH:-/opt/ompi}", + "test_binary_dir": "${RCCL_TEST_BIN_DIR}" + }, + "env_variables": { + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_DEBUG": "WARN" + }, + "build_configuration": { + "cmake_options": { + 
"CMAKE_BUILD_TYPE": "Release", + "ENABLE_CODE_COVERAGE": "OFF", + "BUILD_TESTS": "ON", + "BUILD_LOCAL_GPU_TARGET_ONLY": "ON", + "TRACE": "OFF", + "COLLTRACE": "OFF" + }, + "env_variables": { + "HIPCC_COMPILE_FLAGS_APPEND": "-O3" + }, + "parallel_jobs": 64, + "generator": "Unix Makefiles" + }, + "test_configurations": { + "perf_base": { + "is_gtest": false, + "num_ranks": 8, + "num_nodes": 1, + "timeout": 300, + "env_variables": { + "NCCL_LAUNCH_MODE": "GROUP" + } + }, + "allreduce_perf": { + "extends": "perf_base", + "binary": "all_reduce_perf", + "tests": [ + { + "name": "AllReduce_Perf_SmallMessages", + "description": "AllReduce bandwidth test for small messages (8B - 8KB)", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "AllReduce_Perf_MediumMessages", + "description": "AllReduce bandwidth test for medium messages (16KB - 1MB)", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "AllReduce_Perf_LargeMessages", + "description": "AllReduce bandwidth test for large messages (2MB - 128MB)", + "command_args": "-b 2M -e 128M -f 2 -g 1", + "timeout": 600 + }, + { + "name": "AllReduce_Perf_InPlace", + "description": "AllReduce in-place bandwidth test", + "command_args": "-b 8 -e 128M -f 2 -g 1 -c 1" + }, + { + "name": "AllReduce_Perf_MultiGPU", + "description": "AllReduce test with all 8 GPUs", + "command_args": "-b 1M -e 128M -f 2 -g 8" + } + ] + }, + "allgather_perf": { + "extends": "perf_base", + "binary": "all_gather_perf", + "tests": [ + { + "name": "AllGather_Perf_SmallMessages", + "description": "AllGather bandwidth test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "AllGather_Perf_MediumMessages", + "description": "AllGather bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "AllGather_Perf_LargeMessages", + "description": "AllGather bandwidth test for large messages", + "command_args": "-b 2M -e 128M -f 2 -g 1", + "timeout": 600 + } + ] + }, + 
"broadcast_perf": { + "extends": "perf_base", + "binary": "broadcast_perf", + "tests": [ + { + "name": "Broadcast_Perf_SmallMessages", + "description": "Broadcast bandwidth test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "Broadcast_Perf_MediumMessages", + "description": "Broadcast bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "Broadcast_Perf_LargeMessages", + "description": "Broadcast bandwidth test for large messages", + "command_args": "-b 2M -e 128M -f 2 -g 1", + "timeout": 600 + } + ] + }, + "reduce_perf": { + "extends": "perf_base", + "binary": "reduce_perf", + "tests": [ + { + "name": "Reduce_Perf_SmallMessages", + "description": "Reduce bandwidth test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "Reduce_Perf_MediumMessages", + "description": "Reduce bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "Reduce_Perf_LargeMessages", + "description": "Reduce bandwidth test for large messages", + "command_args": "-b 2M -e 128M -f 2 -g 1", + "timeout": 600 + } + ] + }, + "reducescatter_perf": { + "extends": "perf_base", + "binary": "reduce_scatter_perf", + "tests": [ + { + "name": "ReduceScatter_Perf_SmallMessages", + "description": "ReduceScatter bandwidth test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "ReduceScatter_Perf_MediumMessages", + "description": "ReduceScatter bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "ReduceScatter_Perf_LargeMessages", + "description": "ReduceScatter bandwidth test for large messages", + "command_args": "-b 2M -e 128M -f 2 -g 1", + "timeout": 600 + } + ] + }, + "alltoall_perf": { + "extends": "perf_base", + "binary": "alltoall_perf", + "tests": [ + { + "name": "AllToAll_Perf_SmallMessages", + "description": "AllToAll bandwidth test for small messages", + "command_args": 
"-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "AllToAll_Perf_MediumMessages", + "description": "AllToAll bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "AllToAll_Perf_LargeMessages", + "description": "AllToAll bandwidth test for large messages", + "command_args": "-b 2M -e 64M -f 2 -g 1", + "timeout": 600 + } + ] + }, + "sendrecv_perf": { + "extends": "perf_base", + "binary": "sendrecv_perf", + "num_ranks": 2, + "tests": [ + { + "name": "SendRecv_Perf_SmallMessages", + "description": "SendRecv bandwidth test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "SendRecv_Perf_MediumMessages", + "description": "SendRecv bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "SendRecv_Perf_LargeMessages", + "description": "SendRecv bandwidth test for large messages", + "command_args": "-b 2M -e 128M -f 2 -g 1", + "timeout": 600 + }, + { + "name": "SendRecv_Perf_Latency", + "description": "SendRecv latency test", + "command_args": "-b 8 -e 8 -f 1 -g 1 -n 1000" + } + ] + }, + "allreduce_multinode": { + "extends": "perf_base", + "binary": "all_reduce_perf", + "num_ranks": 16, + "num_nodes": 2, + "timeout": 600, + "env_variables": { + "NCCL_IB_DISABLE": "0", + "NCCL_NET_GDR_LEVEL": "5", + "NCCL_SOCKET_IFNAME": "eth0,eth1" + }, + "tests": [ + { + "name": "AllReduce_MultiNode_SmallMessages", + "description": "Multi-node AllReduce test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "AllReduce_MultiNode_MediumMessages", + "description": "Multi-node AllReduce test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "AllReduce_MultiNode_LargeMessages", + "description": "Multi-node AllReduce test for large messages", + "command_args": "-b 2M -e 128M -f 2 -g 1", + "timeout": 900 + }, + { + "name": "AllReduce_MultiNode_MaxBandwidth", + "description": "Multi-node AllReduce maximum bandwidth test", + 
"command_args": "-b 128M -e 2G -f 2 -g 8", + "timeout": 1200 + } + ] + }, + "scatter_gather_perf": { + "extends": "perf_base", + "binary": "scatter_perf", + "tests": [ + { + "name": "Scatter_Perf_SmallMessages", + "description": "Scatter bandwidth test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "Scatter_Perf_MediumMessages", + "description": "Scatter bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "Scatter_Perf_LargeMessages", + "description": "Scatter bandwidth test for large messages", + "command_args": "-b 2M -e 64M -f 2 -g 1" + } + ] + }, + "gather_perf": { + "extends": "perf_base", + "binary": "gather_perf", + "tests": [ + { + "name": "Gather_Perf_SmallMessages", + "description": "Gather bandwidth test for small messages", + "command_args": "-b 8 -e 8K -f 2 -g 1" + }, + { + "name": "Gather_Perf_MediumMessages", + "description": "Gather bandwidth test for medium messages", + "command_args": "-b 16K -e 1M -f 2 -g 1" + }, + { + "name": "Gather_Perf_LargeMessages", + "description": "Gather bandwidth test for large messages", + "command_args": "-b 2M -e 64M -f 2 -g 1" + } + ] + }, + "allreduce_algos": { + "extends": "perf_base", + "binary": "all_reduce_perf", + "num_ranks": 8, + "tests": [ + { + "name": "AllReduce_Ring_Algorithm", + "description": "AllReduce using Ring algorithm", + "command_args": "-b 1M -e 128M -f 2 -g 1", + "env_variables": { + "NCCL_ALGO": "Ring" + } + }, + { + "name": "AllReduce_Tree_Algorithm", + "description": "AllReduce using Tree algorithm", + "command_args": "-b 1M -e 128M -f 2 -g 1", + "env_variables": { + "NCCL_ALGO": "Tree" + } + }, + { + "name": "AllReduce_CollNetDirect", + "description": "AllReduce using CollNet Direct", + "command_args": "-b 1M -e 128M -f 2 -g 1", + "env_variables": { + "NCCL_ALGO": "CollNetDirect" + } + } + ] + }, + "allreduce_protocols": { + "extends": "perf_base", + "binary": "all_reduce_perf", + "num_ranks": 8, + "tests": [ 
+ { + "name": "AllReduce_SimpleProtocol", + "description": "AllReduce using Simple protocol", + "command_args": "-b 1M -e 128M -f 2 -g 1", + "env_variables": { + "NCCL_PROTO": "Simple" + } + }, + { + "name": "AllReduce_LL_Protocol", + "description": "AllReduce using LL (Low Latency) protocol", + "command_args": "-b 1M -e 128M -f 2 -g 1", + "env_variables": { + "NCCL_PROTO": "LL" + } + }, + { + "name": "AllReduce_LL128_Protocol", + "description": "AllReduce using LL128 protocol", + "command_args": "-b 1M -e 128M -f 2 -g 1", + "env_variables": { + "NCCL_PROTO": "LL128" + } + } + ] + }, + "stress_tests": { + "extends": "perf_base", + "binary": "all_reduce_perf", + "num_ranks": 8, + "timeout": 1800, + "tests": [ + { + "name": "AllReduce_Stress_LongDuration", + "description": "Long duration AllReduce stress test", + "command_args": "-b 1M -e 128M -f 2 -g 8 -n 10000" + }, + { + "name": "AllReduce_Stress_MaxSize", + "description": "Maximum message size stress test", + "command_args": "-b 1G -e 2G -f 2 -g 8", + "timeout": 2400 + }, + { + "name": "AllReduce_Stress_AllSizes", + "description": "All message sizes comprehensive test", + "command_args": "-b 8 -e 2G -f 2 -g 8 -n 100", + "timeout": 3600 + } + ] + } + }, + "test_suites": [ + { + "name": "AllReduce Performance Tests", + "description": "AllReduce collective bandwidth and latency benchmarks", + "config": "allreduce_perf", + "enabled": true + }, + { + "name": "AllGather Performance Tests", + "description": "AllGather collective bandwidth benchmarks", + "config": "allgather_perf", + "enabled": true + }, + { + "name": "Broadcast Performance Tests", + "description": "Broadcast collective bandwidth benchmarks", + "config": "broadcast_perf", + "enabled": true + }, + { + "name": "Reduce Performance Tests", + "description": "Reduce collective bandwidth benchmarks", + "config": "reduce_perf", + "enabled": true + }, + { + "name": "ReduceScatter Performance Tests", + "description": "ReduceScatter collective bandwidth 
benchmarks", + "config": "reducescatter_perf", + "enabled": true + }, + { + "name": "AllToAll Performance Tests", + "description": "AllToAll collective bandwidth benchmarks", + "config": "alltoall_perf", + "enabled": true + }, + { + "name": "SendRecv Performance Tests", + "description": "Point-to-point SendRecv bandwidth and latency benchmarks", + "config": "sendrecv_perf", + "enabled": true + }, + { + "name": "Scatter Performance Tests", + "description": "Scatter collective bandwidth benchmarks", + "config": "scatter_gather_perf", + "enabled": false + }, + { + "name": "Gather Performance Tests", + "description": "Gather collective bandwidth benchmarks", + "config": "gather_perf", + "enabled": false + }, + { + "name": "AllReduce Multi-Node Tests", + "description": "Multi-node AllReduce performance tests (requires 2+ nodes)", + "config": "allreduce_multinode", + "enabled": false + }, + { + "name": "AllReduce Algorithm Comparison", + "description": "Compare different AllReduce algorithms (Ring, Tree, CollNet)", + "config": "allreduce_algos", + "enabled": false + }, + { + "name": "AllReduce Protocol Comparison", + "description": "Compare different protocols (Simple, LL, LL128)", + "config": "allreduce_protocols", + "enabled": false + }, + { + "name": "Stress Tests", + "description": "Long duration and maximum size stress tests", + "config": "stress_tests", + "enabled": false + } + ] +} + diff --git a/projects/rccl/tools/scripts/test_runner/configs/test_config_sample.json b/projects/rccl/tools/scripts/test_runner/configs/test_config_sample.json new file mode 100644 index 0000000000..d67191084b --- /dev/null +++ b/projects/rccl/tools/scripts/test_runner/configs/test_config_sample.json @@ -0,0 +1,126 @@ +{ + "system_configurations": { + "name": "rccl-test-system", + "description": "Optional description of the system" + }, + + "paths": { + "workdir": "${WORKDIR:-/path/to/rccl}", + "rocm_path": "${ROCM_PATH:-/opt/rocm}", + "mpi_path": "${MPI_PATH:-/opt/ompi}", + 
"test_binary_dir": "${RCCL_TEST_BIN_DIR:-build/test}" + }, + + "env_variables": { + "HSA_NO_SCRATCH_RECLAIM": "1", + "NCCL_DEBUG": "WARN" + }, + + "build_configuration": { + "cmake_options": { + "CMAKE_BUILD_TYPE": "Release", + "BUILD_TESTS": "ON" + }, + "env_variables": { + "HIPCC_COMPILE_FLAGS_APPEND": "-O2" + }, + "parallel_jobs": 64, + "generator": "Unix Makefiles" + }, + + "test_configurations": { + "base_config": { + "env_variables": { + "NCCL_LAUNCH_MODE": "GROUP" + }, + "args": ["--verbose"], + "mpi_args": ["--bind-to none"] + }, + + "gtest_config": { + "extends": "base_config", + "is_gtest": true, + "binary": "rccl-UnitTests", + "num_ranks": 1, + "num_nodes": 1, + "num_gpus": 8, + "timeout": 120, + "env_variables": { + "NCCL_DEBUG": "INFO" + }, + "tests": [ + { + "name": "AllReduceTest", + "description": "Test AllReduce with specific parameters", + "is_gtest": true, + "binary": "rccl-UnitTests", + "test_filter": "AllReduce.InPlace", + "command_args": "--gtest_also_run_disabled_tests", + "num_ranks": 1, + "num_nodes": 1, + "num_gpus": 4, + "timeout": 60, + "env_variables": { + "NCCL_DEBUG": "TRACE" + } + }, + { + "name": "BroadcastTest", + "test_filter": "Broadcast.*" + } + ] + }, + + "mpi_config": { + "extends": "base_config", + "binary": "rccl-UnitTestsMPI", + "num_ranks": 2, + "num_nodes": 1, + "timeout": 180, + "tests": [ + {"name": "P2pTest", "test_filter": "P2pMPITest.*"}, + {"name": "ShmTest", "test_filter": "ShmMPITest.*"} + ] + }, + + "perf_config": { + "is_gtest": false, + "binary": "all_reduce_perf", + "num_ranks": 8, + "num_nodes": 2, + "num_gpus": 4, + "timeout": 300, + "tests": [ + { + "name": "AllReducePerf", + "command_args": "-b 8 -e 128M -f 2 -g 1" + } + ] + } + }, + + "test_suites": [ + { + "name": "unit_tests", + "description": "Unit tests with GTest", + "config": "gtest_config", + "enabled": true, + "num_ranks": 1, + "num_nodes": 1, + "num_gpus": 8, + "timeout": 200, + "env_variables": { + "NCCL_DEBUG_SUBSYS": "INIT" + } + }, + { + 
"name": "mpi_tests", + "config": "mpi_config" + }, + { + "name": "perf_tests", + "config": "perf_config", + "enabled": false + } + ] +} diff --git a/projects/rccl/tools/scripts/test_runner/lib/__init__.py b/projects/rccl/tools/scripts/test_runner/lib/__init__.py new file mode 100644 index 0000000000..61372b54ff --- /dev/null +++ b/projects/rccl/tools/scripts/test_runner/lib/__init__.py @@ -0,0 +1,20 @@ +""" +RCCL Test Runner Library +Provides modules for test configuration, parsing, and execution +""" + +from .test_config import TestConfigProcessor +from .test_parser import ArgumentParserInterface, parse_test_output +from .test_executor import TestExecutor, ExitCode, TestResult + +__all__ = [ + 'TestConfigProcessor', + 'ArgumentParserInterface', + 'parse_test_output', + 'TestExecutor', + 'ExitCode', + 'TestResult' +] + +__version__ = '1.0.0' + diff --git a/projects/rccl/tools/scripts/test_runner/lib/test_config.py b/projects/rccl/tools/scripts/test_runner/lib/test_config.py new file mode 100644 index 0000000000..c9b1562a3f --- /dev/null +++ b/projects/rccl/tools/scripts/test_runner/lib/test_config.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
#
# See LICENSE.txt for license information
"""
Test Configuration Processor Module
Handles hierarchical test configuration with inheritance and merging
"""

import json
import os
import re
from copy import deepcopy
from pathlib import Path
from types import MappingProxyType

# Set default WORKDIR to rccl root directory if not already defined
# This file is at: rccl/tools/scripts/test_runner/lib/test_config.py
# rccl root is 5 directories up
if "WORKDIR" not in os.environ:
    _ancestors = Path(__file__).resolve().parents
    # Guard the lookup: when this module is copied to a shallower location
    # there is no fifth ancestor, and indexing would raise IndexError at import.
    if len(_ancestors) > 4:
        _rccl_root = _ancestors[4]
        os.environ["WORKDIR"] = str(_rccl_root)


class TestConfigProcessor:
    """
    Processes hierarchical test configurations with support for:
    - Configuration inheritance ('extends' directive)
    - Environment variable merging
    - Test parameter inheritance
    - Environment variable expansion in paths
    """

    # The class name starts with "Test" but this is not a test case; tell
    # pytest-style collectors to ignore it.
    __test__ = False

    # Fields that can be defaulted at configuration or suite level and are
    # then copied onto every individual test.
    _DEFAULT_FIELDS = ("is_gtest", "binary", "num_ranks", "num_nodes", "num_gpus", "timeout")

    def __init__(self, config_file):
        """
        Initialize the TestConfigProcessor with the configuration file.

        Args:
            config_file: Path to JSON configuration file

        Raises:
            FileNotFoundError: If config_file does not exist
        """
        if not os.path.exists(config_file):
            raise FileNotFoundError(f"Configuration file not found: {config_file}")

        # Load the JSON configuration file (JSON text is UTF-8; do not depend
        # on the locale's default encoding)
        with open(config_file, 'r', encoding='utf-8') as file:
            config_data = json.load(file)

        # Expand environment variables in paths section
        if "paths" in config_data:
            config_data["paths"] = self._expand_env_vars_in_dict(config_data["paths"])

        # Expose the configuration through a read-only view. Only the top
        # level is protected; nested dicts/lists are still ordinary objects,
        # which is why combine_configs() works on deep copies.
        self.config = MappingProxyType(config_data)
        self.config_file = config_file

    def _expand_env_var(self, value):
        """
        Expand environment variables in a string.

        Supports ${VAR}, ${VAR:-default} and $VAR syntax. A reference to a
        variable that is not set is left in the string unchanged, except for
        the ${VAR:-default} form, which is replaced by the (recursively
        expanded) default. The default is used only when VAR is absent from
        the environment; a variable set to an empty string expands to "".

        Args:
            value: String that may contain environment variables

        Returns:
            str: String with environment variables expanded. Non-string
            values are returned unchanged.

        Examples:
            "${HOME}/code" -> "/home/user/code"
            "$ROCM_PATH/bin" -> "/opt/rocm/bin"
            "${UNDEFINED:-/default}" -> "/default" (bash-style default)
            "${WORKDIR:-$HOME/code}" -> expands $HOME in default if WORKDIR not set
        """
        if not isinstance(value, str):
            return value

        def replace_with_default(match):
            # ${VAR:-default}: use the environment value, else the default
            result = os.environ.get(match.group(1))
            if result is None:
                # Recursively expand env vars in the default value
                result = self._expand_env_var(match.group(2))
            return result

        def replace_if_set(match):
            # ${VAR} / $VAR: keep the original text when VAR is not set
            return os.environ.get(match.group(1), match.group(0))

        # Order matters: the ${VAR:-default} form must be consumed before the
        # plain ${VAR} and $VAR forms.
        value = re.sub(r'\$\{([A-Za-z_][A-Za-z0-9_]*):-([^}]*)\}', replace_with_default, value)
        value = re.sub(r'\$\{([A-Za-z_][A-Za-z0-9_]*)\}', replace_if_set, value)
        # "$" followed by "{" never matches here, so unresolved ${VAR} text
        # is not touched a second time.
        value = re.sub(r'\$([A-Za-z_][A-Za-z0-9_]*)', replace_if_set, value)

        return value

    def _expand_env_vars_in_dict(self, data):
        """
        Recursively expand environment variables in all string values.

        Args:
            data: Dict, list, string or any other JSON value

        Returns:
            A new dict/list with every contained string expanded; values of
            any other type are returned unchanged.
        """
        if isinstance(data, dict):
            return {key: self._expand_env_vars_in_dict(value) for key, value in data.items()}
        if isinstance(data, list):
            return [self._expand_env_vars_in_dict(item) for item in data]
        if isinstance(data, str):
            return self._expand_env_var(data)
        return data

    def combine_configs(self, config_name, _chain=()):
        """
        Combines configurations generically using the 'extends' directive.

        Merging rules:
        - env_variables: Overwrite duplicate keys (child overwrites parent)
        - mpi_args: Append and remove duplicates
        - args: Append and remove duplicates
        - tests: Merge by test name
        - Other fields: Child overwrites parent

        Args:
            config_name: Name of configuration to combine
            _chain: Internal. Names of the configurations currently being
                resolved, used to detect circular 'extends' references.
                Callers should not pass it.

        Returns:
            dict: Combined configuration (a deep copy; the 'extends' key is
            removed)

        Raises:
            ValueError: If the configuration or one of its parents does not
                exist, or if the 'extends' chain is circular
        """
        test_configs = self.config.get("test_configurations", {})
        if config_name not in test_configs:
            raise ValueError(
                f"Configuration '{config_name}' not found in test_configurations. "
                f"Available: {', '.join(test_configs.keys())}"
            )

        # A configuration that (directly or indirectly) extends itself would
        # otherwise recurse until RecursionError.
        if config_name in _chain:
            cycle = " -> ".join((*_chain, config_name))
            raise ValueError(f"Circular 'extends' chain detected: {cycle}")
        chain = (*_chain, config_name)

        # Start with a deep copy of the target configuration
        combined_config = deepcopy(test_configs[config_name])

        # Process the 'extends' directive if it exists
        if "extends" in combined_config:
            parent_configs = combined_config.pop("extends")
            if not isinstance(parent_configs, list):
                parent_configs = [parent_configs]

            for parent_config_name in parent_configs:
                if parent_config_name not in test_configs:
                    raise ValueError(
                        f"Parent configuration '{parent_config_name}' not found."
                    )

                # Resolve the parent first (this also handles the parent's
                # own 'extends' directive and returns an independent copy)
                parent_config = self.combine_configs(parent_config_name, chain)

                # Merge all keys from parent into combined configuration
                for key, value in parent_config.items():
                    if key == "env_variables":
                        # Merge env_variables (child overwrites parent)
                        current_env = combined_config.get("env_variables", {})
                        combined_config["env_variables"] = {**value, **current_env}
                    elif key in ("args", "mpi_args"):
                        # Append lists and remove duplicates (preserve order)
                        current_items = combined_config.get(key, [])
                        if isinstance(current_items, list) and isinstance(value, list):
                            combined_config[key] = list(dict.fromkeys(value + current_items))
                        elif isinstance(value, list):
                            combined_config[key] = value
                    elif key == "tests":
                        # Merge tests by name
                        current_tests = combined_config.get("tests", [])
                        combined_config["tests"] = self._merge_tests(value, current_tests)
                    elif key not in combined_config:
                        # Child overwrites parent for other keys
                        combined_config[key] = value

        return combined_config

    def _merge_tests(self, parent_tests, child_tests):
        """
        Merges two lists of tests by name.

        Entries may be dicts with a "name" key or plain strings (treated as
        {"name": <string>}). Dict entries without a name are dropped. When a
        child dict has the same name as a parent test the two are merged
        shallowly with child keys winning; a child given as a plain string
        replaces the parent entry of that name.

        Args:
            parent_tests: List of parent tests
            child_tests: List of child tests

        Returns:
            list: Merged list of test dicts, parents first, in first-seen order
        """
        test_map = {}

        # Process parent tests
        for test in parent_tests:
            if isinstance(test, str):
                test_map[test] = {"name": test}
            elif isinstance(test, dict):
                name = test.get("name")
                if name:
                    test_map[name] = test

        # Process child tests (child overwrites parent)
        for test in child_tests:
            if isinstance(test, str):
                test_map[test] = {"name": test}
            elif isinstance(test, dict):
                name = test.get("name")
                if name:
                    # Merge with parent test if exists
                    if name in test_map:
                        test_map[name] = {**test_map[name], **test}
                    else:
                        test_map[name] = test

        # Convert map back to list
        return list(test_map.values())

    def _apply_test_defaults(self, tests, config_defaults):
        """
        Apply configuration-level defaults to individual tests.

        Test-specific values override configuration defaults. Tests given as
        plain strings are treated as {"name": <string>}, matching
        _merge_tests().

        Args:
            tests: List of test dictionaries (or test-name strings)
            config_defaults: Dictionary with default values from configuration

        Returns:
            list: New test dicts with defaults applied
        """
        # Only the known default fields are inherited
        defaults = {
            field: config_defaults[field]
            for field in self._DEFAULT_FIELDS
            if field in config_defaults
        }

        processed_tests = []
        for test in tests:
            if isinstance(test, str):
                test = {"name": test}
            # Start with config defaults, then override with test-specific values
            processed_tests.append({**defaults, **test})

        return processed_tests

    def parse_test_suites(self):
        """
        Parses the test_suites section and processes each test suite.

        Applies hierarchical defaults in order (test-specific overrides suite, suite overrides config):
        1. Configuration-level defaults
        2. Test suite-level defaults (override config)
        3. Individual test values (override both)

        Suite-level env_variables are merged over the configuration's
        env_variables (suite wins).

        Returns:
            list: List of combined configurations for each test suite

        Raises:
            ValueError: If a suite names no configuration or the named
                configuration cannot be combined
        """
        test_suites = self.config.get("test_suites", [])
        combined_suites = []

        for suite in test_suites:
            config_name = suite.get("config")
            if not config_name:
                raise ValueError(
                    f"Test suite '{suite.get('name')}' does not specify a configuration."
                )

            # Combine the configuration for the test suite
            combined_config = self.combine_configs(config_name)

            # Extract configuration-level defaults
            config_defaults = {
                "is_gtest": combined_config.get("is_gtest"),
                "binary": combined_config.get("binary"),
                "num_ranks": combined_config.get("num_ranks"),
                "num_nodes": combined_config.get("num_nodes"),
                "num_gpus": combined_config.get("num_gpus", 8),
                "timeout": combined_config.get("timeout")
            }
            # Remove None values
            config_defaults = {k: v for k, v in config_defaults.items() if v is not None}

            # Extract suite-level defaults (override config-level)
            suite_defaults = {
                field: suite.get(field)
                for field in self._DEFAULT_FIELDS
                if suite.get(field) is not None
            }

            # Merge defaults: suite-level overrides config-level
            merged_defaults = {**config_defaults, **suite_defaults}

            # Apply merged defaults to tests
            tests = combined_config.get("tests", [])
            if tests and merged_defaults:
                combined_config["tests"] = self._apply_test_defaults(tests, merged_defaults)

            # Carry suite-level environment variables into the combined
            # configuration so they are not silently dropped.
            suite_env = suite.get("env_variables")
            if isinstance(suite_env, dict) and suite_env:
                combined_config["env_variables"] = {
                    **combined_config.get("env_variables", {}),
                    **suite_env,
                }

            # Add suite-specific details
            combined_config["suite_details"] = {
                "name": suite.get("name"),
                "description": suite.get("description", ""),
                "num_nodes": suite.get("num_nodes", 1),
                "num_ranks": suite.get("num_ranks", 1),
                "num_gpus": suite.get("num_gpus", 8),
                "enabled": suite.get("enabled", True)
            }

            combined_suites.append(combined_config)

        return combined_suites

    def get_system_config(self):
        """
        Get system-wide configuration settings.

        Returns:
            dict: System configuration
        """
        return self.config.get("system_configurations", {})

    def get_env_variables(self):
        """
        Get global environment variables.

        Returns:
            dict: Global environment variables
        """
        return self.config.get("env_variables", {})

    def get_paths(self):
        """
        Get system paths (ROCM, MPI, etc.).

        Returns:
            dict: System paths
        """
        return self.config.get("paths", {})

    def get_build_config(self):
        """
        Get build configuration settings.

        Returns:
            dict: Build configuration with CMake options, environment variables, etc.
        """
        return self.config.get("build_configuration", {})

    def validate_config(self):
        """
        Validate the configuration for required fields.

        Returns:
            bool: True when the configuration is valid

        Raises:
            ValueError: If configuration is invalid
        """
        # Check for required top-level keys
        required_keys = ["test_configurations", "test_suites"]
        for key in required_keys:
            if key not in self.config:
                raise ValueError(f"Missing required configuration key: {key}")

        # Validate test suites
        test_suites = self.config.get("test_suites", [])
        if not test_suites:
            raise ValueError("No test suites defined in configuration")

        for suite in test_suites:
            if "name" not in suite:
                raise ValueError("Test suite missing 'name' field")
            if "config" not in suite:
                raise ValueError(f"Test suite '{suite['name']}' missing 'config' field")

        return True


# ---- patch boundary (non-code text carried over from the original line) ----
# diff --git a/projects/rccl/tools/scripts/test_runner/lib/test_executor.py b/projects/rccl/tools/scripts/test_runner/lib/test_executor.py
# new file mode 100644
# index 0000000000..22b51079c4
# --- /dev/null
# +++ b/projects/rccl/tools/scripts/test_runner/lib/test_executor.py
# @@ -0,0 +1,858 @@
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+# +# See LICENSE.txt for license information +""" +Test Executor Module +Handles test execution, build processes, and result tracking +""" + +import os +import subprocess +import sys +import time +import datetime +from enum import IntEnum, Enum +from pathlib import Path + +# Make stdout unbuffered to prevent output ordering issues with subprocesses +sys.stdout.reconfigure(line_buffering=True) + + +class ExitCode(IntEnum): + """Exit codes for processes""" + EXIT_SUCCESS = 0 + EXIT_FAILURE = 1 + EXIT_TIMEOUT = 124 + + +class TestResult(str, Enum): + """Test result statuses""" + RESULT_PASSED = "PASSED" + RESULT_FAILED = "FAILED" + RESULT_TIMEOUT = "TIMEOUT" + RESULT_SKIPPED = "SKIPPED" + + +class TestExecutor: + """ + Executes tests and manages build/test workflows + """ + + def __init__(self, config_processor, args): + """ + Initialize TestExecutor + + Args: + config_processor: TestConfigProcessor instance + args: Parsed command-line arguments + """ + self.config_processor = config_processor + self.args = args + self.system_config = config_processor.get_system_config() + self.paths = config_processor.get_paths() + self.global_env = config_processor.get_env_variables() + self.build_config = config_processor.get_build_config() + + # Setup directories + self.setup_directories() + + # Detect MPI hostfile once during initialization + self.mpi_hostfile = self._detect_mpi_hostfile() + + # Test tracking + self.test_results = [] + self.test_names = [] + self.test_durations = [] + self.test_suites = [] + + def setup_directories(self): + """Setup build and log directories""" + workdir = self.paths.get("workdir", os.getcwd()) + + # Determine workspace name (with or without timestamp) + suffix_part = f"_{self.args.report_suffix}" if self.args.report_suffix else "" + if self.args.overwrite: + workspace_name = f"rccl_test_artifacts{suffix_part}" + timestamp_suffix = "" + else: + timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S") + workspace_name = 
f"rccl_test_artifacts{suffix_part}_{timestamp}" + timestamp_suffix = f"_{timestamp}" + + # Create workspace directory path + self.workspace_dir = os.path.join(workdir, workspace_name) + + # Check for custom RCCL library path from environment variable + custom_rccl_path = os.environ.get('RCCL_LIB_PATH') or os.environ.get('RCCL_BUILD_DIR') + + if custom_rccl_path: + # Use custom library path from environment variable + self.build_dir = os.path.expanduser(os.path.expandvars(custom_rccl_path)) + self.using_custom_lib = True + if self.args.verbose: + print(f"Using custom RCCL library path from environment: {self.build_dir}") + else: + # Use default build directory + self.using_custom_lib = False + self.build_dir = os.path.join( + workdir, + f"build_debug_cov_on_tests_on{timestamp_suffix}" + ) + + # Set log and report directories under workspace + self.log_dir = os.path.join(self.workspace_dir, "logs") + self.report_dir = os.path.join(self.workspace_dir, "report") + + # Create directories (skip build_dir if using custom lib) + if not self.using_custom_lib: + os.makedirs(self.build_dir, exist_ok=True) + os.makedirs(self.log_dir, exist_ok=True) + os.makedirs(self.report_dir, exist_ok=True) + + if self.args.verbose: + print(f"Work directory: {workdir}") + print(f"Workspace directory: {self.workspace_dir}") + print(f"Build directory: {self.build_dir}") + if self.using_custom_lib: + print(f" (Using custom library from RCCL_LIB_PATH/RCCL_BUILD_DIR)") + print(f"Log directory: {self.log_dir}") + print(f"Report directory: {self.report_dir}") + + def _detect_mpi_hostfile(self): + """ + Detect MPI hostfile once during initialization. + Checks RCCL_TEST_MPI_HOSTFILE env var, then ~/.mpi_hostfile default. + Prints detection message only once. 
+ + Returns: + str: Path to hostfile, or None if not found + """ + hostfile = os.environ.get('RCCL_TEST_MPI_HOSTFILE') + if hostfile and os.path.isfile(hostfile): + print(f"Using MPI hostfile from RCCL_TEST_MPI_HOSTFILE: {hostfile}") + return hostfile + + # Check default hostfile + default_hostfile = os.path.expanduser('~/.mpi_hostfile') + if os.path.isfile(default_hostfile): + print(f"Using default MPI hostfile: {default_hostfile}") + return default_hostfile + + # No hostfile found + return None + + def check_environment(self): + """ + Check that required environment and tools are available + + Returns: + bool: True if environment is valid + """ + errors = [] + + # Check ROCm + rocm_path = self.paths.get("rocm_path", "/opt/rocm") + if not os.path.isdir(rocm_path): + errors.append(f"ROCm not found at {rocm_path}") + + # Check MPI + mpi_path = self.paths.get("mpi_path") + if mpi_path: + if not os.path.isdir(mpi_path): + print(f"WARNING: MPI path not found: {mpi_path}") + elif not os.path.isfile(os.path.join(mpi_path, "bin", "mpirun")): + print(f"WARNING: mpirun not found in {mpi_path}/bin/") + + # Check RCCL library (if not building or using custom lib) + if self.args.no_build or self.using_custom_lib: + lib_path = os.path.join(self.build_dir, "librccl.so") + if not os.path.isfile(lib_path): + errors.append(f"RCCL library not found: {lib_path}") + elif self.args.verbose: + print(f"Found RCCL library: {lib_path}") + + if errors: + print("ERROR: Environment check failed:") + for error in errors: + print(f" - {error}") + return False + + if self.args.verbose: + print("Environment validation passed") + return True + + def build_rccl(self): + """ + Build RCCL with test support using configurable build settings + + Returns: + bool: True if build succeeded + """ + # Skip build if using custom library from environment variable + if self.using_custom_lib: + if self.args.verbose: + print("SKIP: Build step skipped (using custom RCCL library from environment)") + return True + 
+ if self.args.no_build: + if self.args.verbose: + print("SKIP: Build step skipped (--no-build)") + return True + + print("="*80) + print("BUILDING RCCL") + print("="*80) + + workdir = self.paths.get("workdir", os.getcwd()) + rocm_path = self.paths.get("rocm_path", "/opt/rocm") + mpi_path = self.paths.get("mpi_path", "") + + # Get build configuration (with defaults) + cmake_options = self.build_config.get("cmake_options", {}) + build_env_vars = self.build_config.get("env_variables", {}) + parallel_jobs = self.build_config.get("parallel_jobs", 64) + generator = self.build_config.get("generator", "Unix Makefiles") + + if self.args.verbose: + print(f"Work directory: {workdir}") + print(f"ROCm path: {rocm_path}") + print(f"MPI path: {mpi_path}") + print(f"Build directory: {self.build_dir}") + print(f"Parallel jobs: {parallel_jobs}") + print(f"Generator: {generator}") + + # Setup environment for build + env = os.environ.copy() + + # Apply default environment variables for code coverage + default_env = { + 'HIPCC_COMPILE_FLAGS_APPEND': ( + "-g -Wno-format-nonliteral -Xarch_host -fprofile-instr-generate " + "-Xarch_host -fcoverage-mapping -parallel-jobs=16" + ), + 'HIPCC_LINK_FLAGS_APPEND': ( + "-fprofile-instr-generate -fcoverage-mapping -parallel-jobs=16" + ), + 'LLVM_PROFILE_FILE': "rccl_tests_%p_%m.profraw", + 'CXX': f"{rocm_path}/bin/amdclang++" + } + + # Merge with user-provided build environment variables (user values override defaults) + for key, value in default_env.items(): + env[key] = value + for key, value in build_env_vars.items(): + env[key] = str(value) + + # Build CMake configuration command with defaults + default_cmake_options = { + "CMAKE_CXX_FLAGS": "-Wl,--build-id=sha1", + "CMAKE_EXE_LINKER_FLAGS": "-Wl,--build-id=sha1", + "CMAKE_BUILD_TYPE": "Debug", + "ENABLE_CODE_COVERAGE": "ON", + "BUILD_TESTS": "ON", + "BUILD_LOCAL_GPU_TARGET_ONLY": "ON", + "TRACE": "ON", + "COLLTRACE": "ON", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_VERBOSE_MAKEFILE": 
"1", + "ENABLE_MPI_TESTS": "ON", + "MPI_PATH": mpi_path + } + + # Merge with user-provided CMake options (user values override defaults) + merged_cmake_options = {**default_cmake_options, **cmake_options} + + # Build CMake command + cmake_cmd = [ + "cmake", + "-S", workdir, + "-B", self.build_dir + ] + + # Add CMake options as -D flags + for key, value in merged_cmake_options.items(): + cmake_cmd.append(f"-D{key}={value}") + + # Add generator + cmake_cmd.append(f"-G{generator}") + + try: + print("Running CMake configuration...") + if self.args.verbose: + print(f"CMake command: {' '.join(cmake_cmd)}") + print(f"Build environment variables:") + for key, value in build_env_vars.items(): + print(f" {key}={value}") + + result = subprocess.run( + cmake_cmd, + cwd=workdir, + env=env, + capture_output=False + ) + + if result.returncode != 0: + print(f"ERROR: CMake configuration failed") + return False + + print("\nRunning CMake build...") + build_cmd = f"cmake --build {self.build_dir} --parallel {parallel_jobs}" + if self.args.verbose: + print(f"Build command: {build_cmd}") + + result = subprocess.run( + build_cmd, + shell=True, + cwd=workdir, + env=env, + capture_output=False + ) + + if result.returncode != 0: + print(f"ERROR: CMake build failed") + return False + + print("Build completed successfully") + return True + + except Exception as e: + print(f"ERROR: Build failed with exception: {e}") + return False + + def _resolve_binary_path(self, binary, test_config): + """ + Resolve the test binary path using multiple strategies: + 1. If binary is an absolute path -> use it directly + 2. If test_binary_dir is specified in config -> use as base directory + 3. If binary contains ${VAR} -> expand environment variables + 4. 
Otherwise -> use default build_dir/test/binary + + Args: + binary: Binary name or path from config + test_config: Test configuration dict + + Returns: + str: Resolved absolute path to the binary + """ + # Strategy 1: Check if binary is already an absolute path + if os.path.isabs(binary): + expanded_path = os.path.expandvars(binary) + return os.path.expanduser(expanded_path) + + # Strategy 2: Expand environment variables in binary path + if '$' in binary or '~' in binary: + expanded_path = os.path.expandvars(binary) + expanded_path = os.path.expanduser(expanded_path) + # If after expansion it becomes absolute, use it + if os.path.isabs(expanded_path): + return expanded_path + # Otherwise treat as relative to test_binary_dir or build_dir + binary = expanded_path + + # Strategy 3: Check for custom test_binary_dir in config + test_binary_dir = test_config.get("test_binary_dir", "") + if test_binary_dir: + # Expand environment variables in test_binary_dir + test_binary_dir = os.path.expandvars(test_binary_dir) + test_binary_dir = os.path.expanduser(test_binary_dir) + return os.path.join(test_binary_dir, binary) + + # Strategy 4: Check for test_binary_dir in paths config + if "test_binary_dir" in self.paths: + test_binary_dir = self.paths["test_binary_dir"] + # Expand environment variables in test_binary_dir + test_binary_dir = os.path.expandvars(test_binary_dir) + test_binary_dir = os.path.expanduser(test_binary_dir) + return os.path.join(test_binary_dir, binary) + + # Strategy 5: Default - use build_dir/test/binary + return os.path.join(self.build_dir, "test", binary) + + def run_test(self, test_config, suite_config): + """ + Run a single test + + Args: + test_config: Test configuration dict + suite_config: Test suite configuration dict + + Returns: + dict: Test result + """ + test_name = test_config.get("name") + is_gtest = test_config.get("is_gtest", True) # Default to True for backward compatibility + description = test_config.get("description", "") + binary = 
test_config.get("binary", "rccl-UnitTestsMPI") + + # Use test_filter for all test types + test_filter = test_config.get("test_filter", "*") + + num_ranks = test_config.get("num_ranks", 1) + num_nodes = test_config.get("num_nodes", 1) + num_gpus = test_config.get("num_gpus", 8) # GPUs per node (default: 8) + timeout = test_config.get("timeout", 0) + env_vars = test_config.get("env_variables", {}) + + # Support custom command arguments for non-gtest or specialized tests + custom_args = test_config.get("command_args", "") + + # Merge environment variables + merged_env = { + **self.global_env, + **suite_config.get("env_variables", {}), + **env_vars + } + + if self.args.verbose: + print(f"\n{'='*80}") + print(f"Test: {test_name}") + print(f"{'='*80}") + if description: + print(f" Description: {description}") + print(f" Type: {'gtest' if is_gtest else 'non-gtest'}") + print(f" Binary: {binary}") + print(f" Filter: {test_filter}") + print(f" Ranks: {num_ranks}") + print(f" Nodes: {num_nodes}") + print(f" GPUs/node: {num_gpus}") + print(f" Timeout: {timeout if timeout > 0 else 'unlimited'}") + print(f" Started: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Resolve binary path using flexible strategies + test_binary_path = self._resolve_binary_path(binary, test_config) + + if self.args.verbose: + print(f" Binary path: {test_binary_path}") + + if not os.path.isfile(test_binary_path): + print(f"ERROR: Test binary not found: {test_binary_path}") + return { + "name": test_name, + "result": TestResult.RESULT_FAILED.value, + "duration": 0, + "error": f"Binary not found: {test_binary_path}" + } + + # Setup environment + env = os.environ.copy() + + # Build LD_LIBRARY_PATH with build dir and MPI lib (if available) + mpi_path = self.paths.get("mpi_path", "") + ld_library_path_parts = [self.build_dir] + if mpi_path: + ld_library_path_parts.append(os.path.join(mpi_path, "lib")) + if env.get('LD_LIBRARY_PATH'): + ld_library_path_parts.append(env.get('LD_LIBRARY_PATH')) 
+ env['LD_LIBRARY_PATH'] = ":".join(ld_library_path_parts) + + # Set LLVM_PROFILE_FILE for code coverage (prevents default.profraw collision) + env['LLVM_PROFILE_FILE'] = "rccl_tests_%p_%m.profraw" + + # Add test-specific env vars + for key, value in merged_env.items(): + env[key] = str(value) + + # Build command based on test type + if num_ranks == 1: + # Non-MPI test - prepend environment variables to the command + env_prefix = "" + for key, value in merged_env.items(): + env_prefix += f"{key}={value} " + + if is_gtest: + # GTest-based test - use --gtest_filter syntax + if test_filter == "ALL" or test_filter == "*": + cmd = f"{env_prefix}./{binary}" + else: + cmd = f"{env_prefix}./{binary} --gtest_filter={test_filter}" + + # Add custom arguments if provided + if custom_args: + cmd += f" {custom_args}" + else: + # Non-gtest test (perf, custom, etc.) - run binary with args + cmd = f"{env_prefix}./{binary}" + if custom_args: + cmd += f" {custom_args}" + + else: + # MPI test + mpi_path = self.paths.get("mpi_path", "") + mpi_cmd = f"{mpi_path}/bin/mpirun" if mpi_path else "mpirun" + + # Use cached hostfile detected during initialization + hostfile = self.mpi_hostfile + + # Warn if multi-node test without hostfile + if hostfile is None and num_nodes > 1: + print("WARNING: Multi-node test without hostfile") + + hostfile_arg = f"--hostfile {hostfile} " if hostfile else "" + + # Determine mapping strategy based on num_gpus and num_nodes + # Use PPR (processes per resource) to place num_gpus ranks per node + # This ignores the slots specification in the hostfile + if num_nodes > 1: + # Multi-node test: use ppr to control ranks per node + map_by_arg = f"--map-by ppr:{num_gpus}:node " + else: + # Single node: use default mapping (no need for ppr) + map_by_arg = "" + + mpi_args = ( + f"-np {num_ranks} " + f"{hostfile_arg}" + f"{map_by_arg}" + f"--mca btl ^vader,openib " + f"--mca pml ucx " + f"--bind-to none" + ) + + # Add environment variables for MPI + for key, value in 
merged_env.items(): + mpi_args += f" -x {key}={value}" + + # Pass the LD_LIBRARY_PATH + mpi_args += f" -x LD_LIBRARY_PATH={env['LD_LIBRARY_PATH']}" + + # Pass LLVM_PROFILE_FILE to MPI ranks for code coverage (prevents default.profraw collision) + mpi_args += f" -x LLVM_PROFILE_FILE=rccl_tests_%p_%m.profraw" + + # Build test command based on type + if is_gtest: + # GTest-based test - use --gtest_filter syntax + if test_filter == "ALL" or test_filter == "*": + cmd = f"{mpi_cmd} {mpi_args} ./{binary}" + else: + cmd = f"{mpi_cmd} {mpi_args} ./{binary} --gtest_filter={test_filter}" + + if custom_args: + cmd += f" {custom_args}" + else: + # Non-gtest test (perf, custom, etc.) - run binary with args + cmd = f"{mpi_cmd} {mpi_args} ./{binary}" + if custom_args: + cmd += f" {custom_args}" + + + if self.args.verbose: + print(f"\n Command: {cmd}") + print(f" Working directory: {os.path.join(self.build_dir, 'test')}") + print(f" LD_LIBRARY_PATH: {env.get('LD_LIBRARY_PATH', '')}") + print(f" LLVM_PROFILE_FILE: {env.get('LLVM_PROFILE_FILE', 'Not set')}\n") + + # Execute test + start_time = time.time() + try: + if timeout > 0: + result = subprocess.run( + cmd, + shell=True, + cwd=os.path.join(self.build_dir, "test"), + env=env, + capture_output=False, + timeout=timeout + ) + else: + result = subprocess.run( + cmd, + shell=True, + cwd=os.path.join(self.build_dir, "test"), + env=env, + capture_output=False + ) + + duration = time.time() - start_time + + # Determine result + if result.returncode == ExitCode.EXIT_SUCCESS: + test_result = TestResult.RESULT_PASSED.value + elif result.returncode == ExitCode.EXIT_TIMEOUT: + test_result = TestResult.RESULT_TIMEOUT.value + else: + test_result = TestResult.RESULT_FAILED.value + + if self.args.verbose: + print(f"\n Result: {test_result} ({duration:.3f} seconds)") + + return { + "name": test_name, + "result": test_result, + "duration": duration, + "exit_code": result.returncode + } + + except subprocess.TimeoutExpired: + duration = time.time() 
# NOTE(review): the three functions below take ``self`` and are invoked as
# methods of the executor object; the enclosing class header lies outside this
# chunk, so give them the class-level indent when splicing them back.
def run_test_suite(self, suite_config):
    """
    Run all tests in a test suite

    Every executed test is also appended to the executor-wide bookkeeping
    lists (``test_names``, ``test_results``, ``test_durations`` and
    ``test_suites``) that ``print_summary`` reads afterwards.

    Args:
        suite_config: Test suite configuration dict; ``suite_details.name``
            is required, ``tests`` is optional

    Returns:
        list: Result dicts returned by ``run_test`` for the tests that ran
              (empty when the suite defines no tests or none of them match
              the ``--test-name`` filter)
    """
    suite_name = suite_config["suite_details"]["name"]

    if self.args.verbose:
        print(f"\n{'='*80}")
        print(f"TEST SUITE: {suite_name}")
        print(f"{'='*80}")

    tests = suite_config.get("tests", [])
    if not tests:
        if self.args.verbose:
            print(f"WARNING: No tests defined for test suite '{suite_name}'")
        return []

    results = []
    for test in tests:
        # Filter by test name if specified
        test_name = test.get("name")
        if self.args.test_name and test_name != self.args.test_name:
            continue

        result = self.run_test(test, suite_config)
        results.append(result)

        # Keep the four bookkeeping lists index-aligned: one entry per test
        self.test_names.append(test_name)
        self.test_results.append(result["result"])
        self.test_durations.append(result["duration"])
        self.test_suites.append(suite_name)  # Track suite name

    return results


def print_summary(self):
    """
    Print test execution summary

    Prints one table row per recorded test (suite, name, result, duration)
    followed by the passed/failed/timeout totals. Reads the bookkeeping
    lists filled by ``run_test_suite``; returns nothing.
    """
    total_tests = len(self.test_results)
    passed = self.test_results.count(TestResult.RESULT_PASSED.value)
    failed = self.test_results.count(TestResult.RESULT_FAILED.value)
    timeout = self.test_results.count(TestResult.RESULT_TIMEOUT.value)

    # NOTE(review): the original indentation was lost in this chunk; the
    # totals footer is assumed to belong to the same "tests were run" branch
    # as the table, so nothing is printed for an empty run - confirm.
    if total_tests == 0:
        return

    print("\nDetailed Results:")
    print("-"*120)
    print(f"{'Test Suite':<40} {'Test Name':<40} {'Result':<10} {'Duration'}")
    print("-"*120)
    rows = zip(self.test_suites, self.test_names, self.test_results, self.test_durations)
    for suite, name, outcome, duration in rows:
        print(
            f"{suite:<40} "
            f"{name:<40} "
            f"{outcome:<10} "
            f"{duration:.3f} seconds"
        )
    print("-"*120)
    print(f"Total Tests: {total_tests}")
    print(f"Passed: {passed}")
    print(f"Failed: {failed}")
    print(f"Timeout: {timeout}")
    print("="*120)


def generate_coverage_report(self):
    """
    Generate code coverage report

    Does nothing unless ``--coverage-report`` was requested. Otherwise:
      1. collects every ``*.profraw`` file below the build directory and
         copies it into ``<log_dir>/rawfiles``
      2. merges them with ``llvm-profdata`` into ``<log_dir>/merged.profdata``
      3. runs ``llvm-cov show`` (HTML) and ``llvm-cov report`` (text) against
         ``librccl.so`` and the unit-test binaries that exist in the build

    Failures of the LLVM tools are reported on stdout instead of raised. That
    includes the tool binary being absent (``OSError``), which previously
    escaped as an unhandled exception. A failed merge aborts the report; a
    failed HTML or text step is reported and the remaining steps still run.
    """
    if not self.args.coverage_report:
        return

    print(f"\n{'='*80}")
    print("GENERATING COVERAGE REPORT")
    print(f"{'='*80}")

    # Check for profraw files
    import glob
    import shutil

    profraw_files = glob.glob(os.path.join(self.build_dir, "**/*.profraw"), recursive=True)

    if not profraw_files:
        print("WARNING: No profraw files found. Cannot generate coverage report.")
        return

    print(f"Found {len(profraw_files)} profraw files")

    os.makedirs(self.report_dir, exist_ok=True)

    # Create rawfiles directory
    rawfiles_dir = os.path.join(self.log_dir, "rawfiles")
    os.makedirs(rawfiles_dir, exist_ok=True)

    # Copy all profraw files into a single location (originals stay in place)
    print("Copying profraw files...")
    for profraw in profraw_files:
        shutil.copy(profraw, rawfiles_dir)

    # Create a list of raw files to merge (one path per line, as expected by
    # llvm-profdata --input-files)
    rawprofiles_list = os.path.join(self.log_dir, "rawprofiles.list")
    with open(rawprofiles_list, 'w') as f:
        f.writelines(
            f"{profraw}\n"
            for profraw in glob.glob(os.path.join(rawfiles_dir, "*.profraw"))
        )

    # Get ROCm path for LLVM tools
    rocm_path = self.paths.get("rocm_path", "/opt/rocm")
    llvm_bin = os.path.join(rocm_path, "lib", "llvm", "bin")
    llvm_profdata = os.path.join(llvm_bin, "llvm-profdata")
    llvm_cov = os.path.join(llvm_bin, "llvm-cov")

    # Create the merged profdata
    print("Merging profraw files...")
    merged_profdata = os.path.join(self.log_dir, "merged.profdata")

    merge_cmd = [
        llvm_profdata,
        "merge",
        "--sparse",
        f"--input-files={rawprofiles_list}",
        f"--output={merged_profdata}"
    ]

    if self.args.verbose:
        print(f"Merge command: {' '.join(merge_cmd)}")

    try:
        subprocess.run(
            merge_cmd,
            capture_output=True,
            text=True,
            check=True
        )
        print("Profraw files merged successfully")
        if self.args.verbose:
            print(f"Merged profdata file: {merged_profdata}")
    except subprocess.CalledProcessError as e:
        print("ERROR: Failed to merge profraw files")
        print(f"Command: {' '.join(merge_cmd)}")
        print(f"Error: {e.stderr}")
        return
    except OSError as e:
        # Tool missing or not executable: nothing to build the report from
        print("ERROR: Failed to merge profraw files")
        print(f"Command: {' '.join(merge_cmd)}")
        print(f"Error: {e}")
        return

    # Build list of object files as repeated "--object <path>" pairs
    object_files = []

    librccl_so = os.path.join(self.build_dir, "librccl.so")
    if os.path.isfile(librccl_so):
        object_files.extend(["--object", librccl_so])
        if self.args.verbose:
            print(f"Found library: {librccl_so}")

    # Add test binaries
    test_dir = os.path.join(self.build_dir, "test")
    for binary in ["rccl-UnitTestsFixtures", "rccl-UnitTests", "rccl-UnitTestsMPI"]:
        binary_path = os.path.join(test_dir, binary)
        if os.path.isfile(binary_path):
            object_files.extend(["--object", binary_path])
            if self.args.verbose:
                print(f"Found binary: {binary_path}")

    if not object_files:
        print("WARNING: No object files found for coverage report")
        return

    if self.args.verbose:
        # Each object contributes two list entries (flag + path)
        print(f"Total object files for coverage: {len(object_files) // 2}")

    # Ignore patterns for non-relevant files
    ignore_regex = (
        ".*tuner_v.*|.*profiler_v.*|.*net_v.*|.*_deps.*|ext.*|"
        ".*coll_net.*|.*nvls.*|.*nvml.*|.*nvtx.*|test/|.*gtest.*"
    )

    # Create the HTML report
    print("Generating HTML coverage report...")
    html_cmd = [
        llvm_cov,
        "show",
        f"--instr-profile={merged_profdata}",
        "--format=html",
        "--Xdemangler=c++filt",
        f"--output-dir={self.report_dir}",
        "--project-title=RCCL_Lib_Coverage_Report",
        f"--ignore-filename-regex={ignore_regex}"
    ]
    html_cmd.extend(object_files)

    if self.args.verbose:
        print(f"HTML coverage command: {' '.join(html_cmd)}")

    try:
        subprocess.run(
            html_cmd,
            capture_output=True,
            text=True,
            check=True
        )
        print(f"HTML coverage report generated: {self.report_dir}/index.html")
    except subprocess.CalledProcessError as e:
        print("ERROR: Failed to generate HTML coverage report")
        print(f"Error: {e.stderr}")
        if self.args.verbose:
            print(f"Command was: {' '.join(html_cmd)}")
    except OSError as e:
        print("ERROR: Failed to generate HTML coverage report")
        print(f"Error: {e}")
        if self.args.verbose:
            print(f"Command was: {' '.join(html_cmd)}")

    # Generate function coverage summary (text report)
    print("Generating text coverage report...")
    text_report = os.path.join(self.report_dir, "function_coverage_report.txt")

    # Build command matching bash script exactly
    text_cmd = [
        llvm_cov,
        "report",
        f"--instr-profile={merged_profdata}",
        "--Xdemangler=c++filt"
    ]
    # Add object files first
    text_cmd.extend(object_files)
    # Add remaining options - matching bash script order
    text_cmd.extend([
        f"--ignore-filename-regex={ignore_regex}",
        "--show-functions",
        "--sources",
        self.build_dir
    ])

    if self.args.verbose:
        print(f"Text coverage command: {' '.join(text_cmd)}")

    try:
        # llvm-cov writes the report to stdout, which is redirected to the file
        with open(text_report, 'w') as f:
            subprocess.run(
                text_cmd,
                stdout=f,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )
        print(f"Function coverage report generated: {text_report}")
    except subprocess.CalledProcessError as e:
        print("ERROR: Failed to generate text coverage report")
        print(f"Error: {e.stderr}")
        if self.args.verbose:
            print(f"Command was: {' '.join(text_cmd)}")
    except OSError as e:
        print("ERROR: Failed to generate text coverage report")
        print(f"Error: {e}")
        if self.args.verbose:
            print(f"Command was: {' '.join(text_cmd)}")

    print(f"\n{'='*80}")
    print("COVERAGE REPORT GENERATION COMPLETE")
    print(f"{'='*80}")
    print(f"Report directory: {self.report_dir}")
    print(f"HTML report: {self.report_dir}/index.html")
    print(f"Text report: {text_report}")
# NOTE(review): this chunk spans the end of lib/test_parser.py and the whole of
# test_runner.py; the original line structure was lost, so both are laid out
# here in source order. Spacing inside string literals is best-effort.

# ---- lib/test_parser.py ----------------------------------------------------
import re
import argparse


class ArgumentParserInterface:
    """Command-line argument parser for RCCL test runner"""

    def __init__(self):
        # Options are registered separately by add_arguments()
        self.parser = argparse.ArgumentParser(
            description="RCCL Test Runner - Execute and manage RCCL unit tests and MPI tests",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Examples:
  # Run all tests from config
  %(prog)s -c test_config.json

  # Run specific test
  %(prog)s -c test_config.json --test-name NET_AllTests_2Nodes_ETH

  # Run with verbose output
  %(prog)s -c test_config.json -v

  # Skip build and use existing build
  %(prog)s -c test_config.json --no-build

  # Generate coverage report from existing data
  %(prog)s -c test_config.json --no-build --skip-tests --coverage-report
        """
        )

    def add_arguments(self):
        """Add all command-line arguments"""
        self.parser.add_argument(
            '-c', '--config',
            type=str,
            required=True,
            help="Test configuration file (JSON format)"
        )
        self.parser.add_argument(
            '-v', '--verbose',
            action='store_true',
            help="Enable verbose output (detailed logging)"
        )
        self.parser.add_argument(
            '-o', '--output',
            type=str,
            help="Output directory for logs and reports (default: auto-generated)"
        )
        self.parser.add_argument(
            '--test-name',
            type=str,
            help="Run only specific test by name"
        )
        self.parser.add_argument(
            '--no-build',
            action='store_true',
            help="Skip build step and use existing build artifacts"
        )
        self.parser.add_argument(
            '--skip-tests',
            action='store_true',
            help="Skip test execution (useful with --coverage-report)"
        )
        self.parser.add_argument(
            '--coverage-report',
            action='store_true',
            help="Generate code coverage report from profraw files"
        )
        self.parser.add_argument(
            '--overwrite',
            action='store_true',
            help="Overwrite previous build/log directories (default: append timestamp)"
        )
        self.parser.add_argument(
            '--report-suffix',
            type=str,
            default='',
            help="Suffix for report directory name (default: blank)"
        )

    def parse_arguments(self):
        """Parse command-line arguments (reads ``sys.argv`` via argparse)"""
        return self.parser.parse_args()

    def process_arguments(self):
        """
        Register the options, parse the command line and echo the result

        Returns:
            argparse.Namespace: Parsed arguments
        """
        self.add_arguments()
        args = self.parse_arguments()
        self.handle_arguments(args)
        return args

    def handle_arguments(self, args):
        """Print the parsed arguments as a banner when verbose mode is on"""
        if args.verbose:
            print("="*80)
            print("RCCL Test Runner - Configuration")
            print("="*80)
            print(f"Config file: {args.config}")
            print(f"Verbose mode: {args.verbose}")
            print(f"Output dir: {args.output if args.output else 'auto-generated'}")
            print(f"Test name filter: {args.test_name if args.test_name else 'all tests'}")
            print(f"No build: {args.no_build}")
            print(f"Skip tests: {args.skip_tests}")
            print(f"Coverage report: {args.coverage_report}")
            print(f"Overwrite: {args.overwrite}")
            print(f"Report suffix: {args.report_suffix}")
            print("="*80)
            print()


def parse_test_output(output):
    """
    Parse test output and extract results

    Args:
        output: Test output as ``str``; ``bytes`` are decoded as UTF-8 (with
            replacement characters) and ``None`` is treated as empty output

    Returns:
        dict: Parsed test results with the keys
            passed / failed / skipped (bool),
            tests_run / tests_passed / tests_failed (int, from the Google
            Test summary lines, 0 when absent) and
            errors (list of up to 10 strings, each running from an
            ERROR/FAILED/TIMEOUT keyword to the end of its line)
    """
    if output is None:
        output = ""
    elif isinstance(output, (bytes, bytearray)):
        output = bytes(output).decode("utf-8", errors="replace")

    results = {
        'passed': False,
        'failed': False,
        'skipped': False,
        'tests_run': 0,
        'tests_passed': 0,
        'tests_failed': 0,
        'errors': []
    }

    # Google Test output patterns
    gtest_passed = re.search(r'\[\s*PASSED\s*\]\s*(\d+)\s*test', output)
    gtest_failed = re.search(r'\[\s*FAILED\s*\]\s*(\d+)\s*test', output)
    gtest_run = re.search(r'\[==========\]\s*(\d+)\s*test.*ran', output)

    if gtest_run:
        results['tests_run'] = int(gtest_run.group(1))

    if gtest_passed:
        results['tests_passed'] = int(gtest_passed.group(1))

    if gtest_failed:
        results['tests_failed'] = int(gtest_failed.group(1))
        results['failed'] = True
    else:
        # No failure summary: passed only if the run summary reported tests
        results['passed'] = results['tests_run'] > 0

    # Check for skipped tests
    if 'SKIPPED' in output or 'Skipped' in output:
        results['skipped'] = True

    # Extract error messages. The group must be non-capturing: with a
    # capturing group findall() returns only the keyword, not the message.
    errors = re.findall(r'(?:ERROR|FAILED|TIMEOUT).*', output)
    results['errors'] = errors[:10]  # Limit to first 10 errors

    return results


# ---- test_runner.py --------------------------------------------------------
import sys
import os
import json
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


def main():
    """
    Main entry point for test runner

    Parses the command line, loads and validates the configuration, optionally
    builds RCCL, runs every enabled test suite, prints the summary and
    generates the coverage report.

    NOTE(review): every path ends in a bare ``return`` and the ``__main__``
    guard does not forward a status, so the process exit code does not
    reflect test failures - confirm this is intended.
    """
    # Project-local modules are imported at call time so that this module can
    # be imported without the ``lib`` package on the path.
    from lib.test_config import TestConfigProcessor
    from lib.test_executor import TestExecutor, TestResult

    # Parse command-line arguments
    parser_interface = ArgumentParserInterface()
    args = parser_interface.process_arguments()

    # Validate config file exists
    if not os.path.exists(args.config):
        print(f"ERROR: Configuration file not found: {args.config}")
        if args.verbose:
            print("Exiting: Missing configuration file")
        return

    try:
        # Load and validate configuration
        if args.verbose:
            print("Loading configuration...")
        config_processor = TestConfigProcessor(args.config)
        config_processor.validate_config()

        # Create test executor
        executor = TestExecutor(config_processor, args)

        # Check environment
        if not executor.check_environment():
            if args.verbose:
                print("Exiting: Environment check failed")
            return

        # Build RCCL (if not --no-build)
        if not args.no_build:
            if not executor.build_rccl():
                print("ERROR: Build failed")
                if args.verbose:
                    print("Exiting: RCCL build failed")
                return

        # Parse and run test suites
        if not args.skip_tests:
            if args.verbose:
                print("\nParsing test suites...")
            test_suites = config_processor.parse_test_suites()

            if args.verbose:
                print("\nCombined Test Suites (JSON):")
                print(json.dumps(test_suites, indent=2))
                print()
                print(f"Found {len(test_suites)} test suite(s)")

            # Print skip messages for disabled test suites upfront
            print()
            for suite in test_suites:
                suite_name = suite["suite_details"]["name"]
                enabled = suite["suite_details"].get("enabled", True)
                if not enabled:
                    print(f"SKIP: Test suite '{suite_name}' is disabled")

            # Run only enabled test suites
            all_results = []
            for suite in test_suites:
                enabled = suite["suite_details"].get("enabled", True)
                if enabled:
                    results = executor.run_test_suite(suite)
                    all_results.extend(results)

            # Print summary once at the end
            executor.print_summary()

        # Generate coverage report (a no-op unless --coverage-report is set)
        executor.generate_coverage_report()

        # Return based on results
        if executor.test_results:
            failed = executor.test_results.count(TestResult.RESULT_FAILED.value)
            timeout = executor.test_results.count(TestResult.RESULT_TIMEOUT.value)
            if failed > 0 or timeout > 0:
                if args.verbose:
                    print(f"Exiting: Tests failed (failed={failed}, timeout={timeout})")
                return

        if args.verbose:
            print("Exiting: Test run completed successfully")
        return

    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        if args.verbose:
            print("Exiting: User interrupted execution")
        return
    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback
        print(f"\nERROR: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
            print("Exiting: Unhandled exception occurred")
        return


if __name__ == "__main__":
    main()