Merge remote-tracking branch 'nccl/master' into develop
Cette révision appartient à :
@@ -3,15 +3,53 @@
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
RCCL_HOME:=../../build/release
|
||||
HIP_HOME:=/opt/rocm
|
||||
INC:= -I$(RCCL_HOME)/include/ -I$(HIP_HOME)/include/ -D__HIP_PLATFORM_AMD__ -Inccl
|
||||
PLUGIN_SO:=libnccl-tuner.so
|
||||
|
||||
default: $(PLUGIN_SO)
|
||||
# Select `build` as the default goal. This must be a variable assignment:
# written with a plain colon it would declare a rule for a target literally
# named `.DEFAULT_GOAL` and leave the default goal unchanged.
.DEFAULT_GOAL := build
|
||||
PLUGIN_SO:=libnccl-tuner-example.so
|
||||
include ../../makefiles/common.mk
|
||||
SRCDIR ?= $(abspath ../..)
|
||||
BUILDDIR ?= .
|
||||
NCCLDIR := $(BUILDDIR)
|
||||
|
||||
$(PLUGIN_SO): plugin.c
|
||||
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||
SRC_FILES := $(wildcard *.c)
|
||||
DST_DIR := $(BUILDDIR)/test/unit/plugins
|
||||
|
||||
default: ${BUILDDIR}/$(PLUGIN_SO)
|
||||
|
||||
build: ${BUILDDIR}/$(PLUGIN_SO)
|
||||
|
||||
${BUILDDIR}/$(PLUGIN_SO): plugin.c
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${BUILDDIR}
|
||||
$(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||
|
||||
# Test targets - delegate to test directory
|
||||
test:
|
||||
$(MAKE) -C test test TEST_CASE=$(TEST_CASE)
|
||||
|
||||
test-verbose:
|
||||
$(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE)
|
||||
|
||||
# Build tests
|
||||
test-build:
|
||||
$(MAKE) -C test all
|
||||
|
||||
# Optimize configurations from performance data
|
||||
optimize-config:
|
||||
@if [ -z "$(CSV_FILE)" ]; then \
|
||||
echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \
|
||||
echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \
|
||||
exit 1; \
|
||||
fi
|
||||
python3 scripts/optimize_config.py $(CSV_FILE) \
|
||||
$(if $(OUTPUT),-o $(OUTPUT)) \
|
||||
$(if $(METRIC),-m $(METRIC)) \
|
||||
$(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \
|
||||
$(if $(DRY_RUN),--dry-run) \
|
||||
$(if $(NO_HEADER),--no-header)
|
||||
|
||||
clean:
|
||||
rm -f $(PLUGIN_SO)
|
||||
rm -f ${BUILDDIR}/$(PLUGIN_SO)
|
||||
$(MAKE) -C test clean
|
||||
|
||||
# None of these targets name a file; declaring them phony keeps a stray file
# called e.g. `build` or `clean` from making the target look up to date.
.PHONY: default build test test-verbose test-build optimize-config clean
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
# NCCL Example Tuner Plugin
|
||||
|
||||
This example plugin demonstrates a CSV file-based tuning approach: tuning parameters can be selectively overridden based on any of the tuning inputs, without recompiling.
|
||||
|
||||
## Features
|
||||
|
||||
- **File-based Configuration**: Read tuning parameters from a CSV configuration file
|
||||
- **Size-based Tuning**: Specify different configurations based on message size ranges
|
||||
- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks
|
||||
- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default
|
||||
- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE`
|
||||
- **Fallback Behavior**: Gracefully handles missing config files and invalid entries
|
||||
|
||||
## Building
|
||||
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
This will create `libnccl-tuner-example.so` that can be loaded by NCCL.
|
||||
|
||||
## Configuration File Format
|
||||
|
||||
The configuration file uses CSV (Comma-Separated Values) format with one configuration per line:
|
||||
|
||||
```
|
||||
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
- **collective_type**: The collective operation type
|
||||
- `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce`
|
||||
|
||||
- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies
|
||||
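- Use `0` for minimum and `4294967295` for maximum to cover all sizes up to 4 GiB - 1; values are parsed with `strtoull`, so a larger maximum can be given to cover bigger messages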
|
||||
|
||||
- **algorithm**: The NCCL algorithm to use
|
||||
- `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat`
|
||||
|
||||
- **protocol**: The NCCL protocol to use
|
||||
- `ll`, `ll128`, `simple`
|
||||
|
||||
- **channels**: Number of channels (SMs) to use
|
||||
- Use a positive integer to specify exact channel count
|
||||
- Use `-1` to keep NCCL's default channel selection
|
||||
|
||||
- **nNodes**: Number of nodes to match
|
||||
- Use a positive integer to match specific node count
|
||||
- Use `-1` to match any number of nodes
|
||||
|
||||
- **nRanks**: Number of ranks to match
|
||||
- Use a positive integer to match specific rank count
|
||||
- Use `-1` to match any number of ranks
|
||||
|
||||
- **numPipeOps**: Number of pipeline operations to match (optional)
|
||||
- Use a positive integer to match specific pipeline operation count
|
||||
- Use `-1` to match any number of pipeline operations
|
||||
- If omitted, configuration will match any numPipeOps value
|
||||
|
||||
- **regBuff**: Whether user buffer can be registered (optional)
|
||||
- Use `0` to match only non-registered buffers
|
||||
- Use `1` to match only registered buffers
|
||||
- Use `-1` to match either registered or non-registered buffers
|
||||
- If omitted, configuration will match any regBuff value
|
||||
|
||||
### Example Configuration
|
||||
|
||||
```csv
|
||||
# Single-node, small allreduce: use tree algorithm, registered buffers only
|
||||
allreduce,0,65536,tree,simple,2,1,-1,-1,1
|
||||
|
||||
# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers
|
||||
allreduce,65537,1048576,ring,simple,4,4,32,1,0
|
||||
|
||||
# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type
|
||||
allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
|
||||
|
||||
# Single-node broadcast: prefer tree, any pipeOps, any buffer type (backward-compatible 8-field form)
|
||||
broadcast,0,32768,tree,simple,-1,1,-1
|
||||
|
||||
# Multi-node broadcast: optimized for non-registered buffers, single pipeline op
|
||||
broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
|
||||
```
|
||||
|
||||
Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc.
|
||||
|
||||
### Backward Compatibility
|
||||
|
||||
Configurations without the numPipeOps and/or regBuff parameters are fully supported:
|
||||
- 8 fields: matches any numPipeOps and regBuff values
|
||||
- 9 fields: matches any regBuff value
|
||||
- 10 fields: full parameter specification
|
||||
|
||||
This ensures existing configuration files continue to work without modification.
|
||||
|
||||
## Usage
|
||||
|
||||
### Method 1: Default Config File
|
||||
Place your configuration in `nccl_tuner.conf` in the current working directory.
|
||||
|
||||
### Method 2: Environment Variable
|
||||
Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path:
|
||||
|
||||
```bash
|
||||
export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf
|
||||
mpirun -np 4 your_nccl_application
|
||||
```
|
||||
|
||||
## Editing Configuration Files
|
||||
|
||||
### Generating Configuration Files from Raw Data
|
||||
|
||||
A Python script for generating valid CSV configs is provided; see [Using optimize_config.py](scripts/README.md).
|
||||
|
||||
### Spreadsheet Tips:
|
||||
- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff`
|
||||
- Save as CSV format (not Excel format) for the plugin to read
|
||||
- Use data validation to prevent typos in algorithm/protocol names
|
||||
|
||||
## Logging
|
||||
|
||||
The plugin uses NCCL's logging system. To see tuner-related messages:
|
||||
|
||||
```bash
|
||||
export NCCL_DEBUG=INFO
|
||||
```
|
||||
|
||||
This will show when configurations are loaded and applied, including the topology information.
|
||||
|
||||
For detailed debugging output during tuning decisions:
|
||||
|
||||
```bash
|
||||
export NCCL_DEBUG=TRACE
|
||||
```
|
||||
|
||||
This will show verbose information about which configurations are being evaluated and matched.
|
||||
|
||||
## Dimension Matching
|
||||
|
||||
Configurations are only applied when the topology matches:
|
||||
|
||||
- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks
|
||||
- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks
|
||||
- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node
|
||||
- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology
|
||||
|
||||
This allows you to create specialized configurations for different cluster setups while maintaining flexibility.
|
||||
|
||||
## Default Behavior
|
||||
|
||||
If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic.
|
||||
|
||||
When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
1. **Config file not found**: Check the file path and permissions
|
||||
2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters
|
||||
3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory and that `NCCL_TUNER_PLUGIN` specifies either the plugin name or an absolute path to the plugin shared library.
|
||||
4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO`
|
||||
5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards
|
||||
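6. **CSV parsing errors**: Ensure each line has 8 to 10 comma-separated fields. Spaces and tabs around a field are trimmed, but quoted fields are not supported, so do not wrap values in quotes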
|
||||
@@ -0,0 +1,45 @@
|
||||
# NCCL Tuner Configuration File (CSV Format)
|
||||
# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||
#
|
||||
# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
|
||||
# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
|
||||
# Protocols: ll, ll128, simple
|
||||
# Channels: number of channels to use, or -1 to keep default
|
||||
# nNodes: number of nodes to match, or -1 for any number of nodes
|
||||
# nRanks: number of ranks to match, or -1 for any number of ranks
|
||||
# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
|
||||
# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
|
||||
#
|
||||
# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
|
||||
#
|
||||
# Examples:
|
||||
|
||||
# For single-node configurations with registered buffers
|
||||
# Small allreduce operations on single node - use tree algorithm, registered buffers
|
||||
allreduce,0,65536,tree,simple,2,1,-1,-1,1
|
||||
|
||||
# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers
|
||||
# Medium allreduce operations - use ring algorithm
|
||||
allreduce,65537,1048576,ring,simple,4,4,32,1,0
|
||||
|
||||
# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type
|
||||
allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
|
||||
|
||||
# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types
|
||||
# Single node broadcast - prefer tree, any pipeOps, registered buffers only
|
||||
broadcast,0,32768,tree,simple,-1,1,-1,-1,1
|
||||
|
||||
# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring
|
||||
broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
|
||||
|
||||
# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type
|
||||
allgather,0,4294967295,ring,simple,4,2,-1
|
||||
|
||||
# ReduceScatter operations
|
||||
# Small messages on single node, single pipeline op, registered buffers
|
||||
reducescatter,0,131072,tree,simple,2,1,-1,1,1
|
||||
# Large messages on any topology, multiple pipeline ops, non-registered buffers
|
||||
reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0
|
||||
|
||||
# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type
|
||||
reduce,0,4294967295,tree,simple,-1,-1,-1
|
||||
+411
-189
@@ -5,224 +5,446 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "tuner.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define __hidden __attribute__ ((visibility("hidden")))
|
||||
#define HOPPER_COMPCAP_IDX 2
|
||||
// NVLink, PCI, Network
|
||||
#define NCCL_HW_NVLINK 0
|
||||
#define NCCL_HW_PCI 1
|
||||
#define NCCL_HW_NET 2
|
||||
#define MAX_LINE_LENGTH 256
|
||||
|
||||
// Integer base-2 logarithm: returns floor(log2(n)) for n >= 1.
// Returns 0 for n <= 0. Without that guard a negative n would never reach
// zero under an arithmetic right shift and the loop would not terminate.
static long log2i(long n) {
  if (n <= 0) return 0;
  long l = 0;
  while (n >>= 1) l++;
  return l;
}
|
||||
// Latencies in us, Bandwidths in GB/s
|
||||
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
|
||||
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
|
||||
{ 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Tree, Ring
|
||||
{ 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Collnet Direct, Chain
|
||||
{ 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree
|
||||
// CSV field indices for configuration parsing
|
||||
// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||
#define CONFIG_FIELD_COLLTYPE 0
|
||||
#define CONFIG_FIELD_MINBYTES 1
|
||||
#define CONFIG_FIELD_MAXBYTES 2
|
||||
#define CONFIG_FIELD_ALGORITHM 3
|
||||
#define CONFIG_FIELD_PROTOCOL 4
|
||||
#define CONFIG_FIELD_CHANNELS 5
|
||||
#define CONFIG_FIELD_NNODES 6
|
||||
#define CONFIG_FIELD_NRANKS 7
|
||||
#define CONFIG_FIELD_PIPEOPS 8 // Optional field
|
||||
#define CONFIG_FIELD_REGBUFF 9 // Optional field
|
||||
|
||||
struct tuningModel {
|
||||
float hwLat[3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float bwRatio[2][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
|
||||
float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
|
||||
};
|
||||
// Field count constants
|
||||
#define CONFIG_FIELDS_REQUIRED 8 // Minimum required fields (up to nRanks)
|
||||
#define CONFIG_FIELDS_WITH_PIPEOPS 9 // Fields including numPipeOps
|
||||
#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff
|
||||
#define CONFIG_FIELDS_MAX 10 // Maximum number of fields supported
|
||||
|
||||
static struct tuningModel tuning_model = {
|
||||
{
|
||||
/* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
},
|
||||
typedef struct {
|
||||
ncclFunc_t collType;
|
||||
size_t minBytes;
|
||||
size_t maxBytes;
|
||||
int algorithm;
|
||||
int protocol;
|
||||
int nChannels;
|
||||
int nNodes;
|
||||
int nRanks;
|
||||
int numPipeOps;
|
||||
int regBuff;
|
||||
} TuningConfig;
|
||||
|
||||
{
|
||||
/* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 0.86 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
},
|
||||
typedef struct {
|
||||
TuningConfig* configs; // Changed from static array to dynamic pointer
|
||||
int numConfigs;
|
||||
int maxConfigs; // Added to track allocated size
|
||||
size_t nRanks;
|
||||
size_t nNodes;
|
||||
ncclDebugLogger_t logFunction;
|
||||
} TunerContext;
|
||||
|
||||
{
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1, 0.4, 0.5, 1.0, 0.6, 0.4, 0.6, 0.1, 0.3, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
|
||||
{ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 0.2, 0.7, 1.0, 1.0, 1.0, 0.8, 0.7, 0.7, 0.8, 0.8, 0.8, 0.9, },
|
||||
},
|
||||
|
||||
{
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.2, 0.2, 0.1, 0.5, 0.8, 1.0, 0.2, 0.4, 0.5, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
|
||||
{ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
|
||||
},
|
||||
};
|
||||
|
||||
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
|
||||
ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorithm, int protocol, int numPipeOps, float* time, size_t nBytes) {
|
||||
float bw = bandwidths[collType][algorithm][protocol];
|
||||
float lat = latencies[collType][algorithm][protocol];
|
||||
|
||||
if (bw == 0) {
|
||||
*time = -1.0; return ncclSuccess;
|
||||
}
|
||||
int logSize = log2i(nBytes>>6);
|
||||
if (algorithm == NCCL_ALGO_TREE) {
|
||||
if (logSize < 27) bw *= tuning_model.treeCorrectionFactor[protocol][logSize];
|
||||
else bw *= tuning_model.treeCorrectionFactor[protocol][26];
|
||||
}
|
||||
else if (algorithm == NCCL_ALGO_RING) {
|
||||
if(logSize < 27) bw *= tuning_model.ringCorrectionFactor[protocol][logSize];
|
||||
else bw *= tuning_model.ringCorrectionFactor[protocol][26];
|
||||
}
|
||||
|
||||
int latCount = 1;
|
||||
*time = lat * latCount + (nBytes) / (1000 * bw);
|
||||
return ncclSuccess;
|
||||
// Parse collective type from string
|
||||
static ncclFunc_t parseCollType(const char* str) {
|
||||
if (strcmp(str, "broadcast") == 0) return ncclFuncBroadcast;
|
||||
if (strcmp(str, "reduce") == 0) return ncclFuncReduce;
|
||||
if (strcmp(str, "allgather") == 0) return ncclFuncAllGather;
|
||||
if (strcmp(str, "reducescatter") == 0) return ncclFuncReduceScatter;
|
||||
if (strcmp(str, "allreduce") == 0) return ncclFuncAllReduce;
|
||||
return ncclFuncAllReduce; // default
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
|
||||
if (nRanks <= 1) return ncclSuccess;
|
||||
int compCapIndex = HOPPER_COMPCAP_IDX;
|
||||
int index2 = nNodes <= 2 ? nNodes-1 : 2;
|
||||
int index1 = nNodes == 1 ? compCapIndex : 1;
|
||||
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
// Convert collective type to string
|
||||
static const char* collTypeToString(ncclFunc_t collType) {
|
||||
switch (collType) {
|
||||
case ncclFuncBroadcast: return "broadcast";
|
||||
case ncclFuncReduce: return "reduce";
|
||||
case ncclFuncAllGather: return "allgather";
|
||||
case ncclFuncReduceScatter: return "reducescatter";
|
||||
case ncclFuncAllReduce: return "allreduce";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = NCCL_HW_NVLINK;
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
||||
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
|
||||
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
|
||||
nRanks;
|
||||
int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
|
||||
nNodes;
|
||||
// Parse algorithm from string
|
||||
static int parseAlgorithm(const char* str) {
|
||||
if (strcmp(str, "tree") == 0) return NCCL_ALGO_TREE;
|
||||
if (strcmp(str, "ring") == 0) return NCCL_ALGO_RING;
|
||||
if (strcmp(str, "collnet_direct") == 0) return NCCL_ALGO_COLLNET_DIRECT;
|
||||
if (strcmp(str, "collnet_chain") == 0) return NCCL_ALGO_COLLNET_CHAIN;
|
||||
if (strcmp(str, "nvls") == 0) return NCCL_ALGO_NVLS;
|
||||
if (strcmp(str, "nvls_tree") == 0) return NCCL_ALGO_NVLS_TREE;
|
||||
if (strcmp(str, "pat") == 0) return NCCL_ALGO_PAT;
|
||||
return NCCL_ALGO_RING; // default
|
||||
}
|
||||
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
// Convert algorithm to string
|
||||
static const char* algorithmToString(int algorithm) {
|
||||
switch (algorithm) {
|
||||
case NCCL_ALGO_TREE: return "tree";
|
||||
case NCCL_ALGO_RING: return "ring";
|
||||
case NCCL_ALGO_COLLNET_DIRECT: return "collnet_direct";
|
||||
case NCCL_ALGO_COLLNET_CHAIN: return "collnet_chain";
|
||||
case NCCL_ALGO_NVLS: return "nvls";
|
||||
case NCCL_ALGO_NVLS_TREE: return "nvls_tree";
|
||||
case NCCL_ALGO_PAT: return "pat";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 1) continue;
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
|
||||
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
|
||||
float bw = nNodes <= 2 || collnet ? 12.0 : 12.0; //graphs[a]->bwIntra : graphs[a]->bwInter
|
||||
if (a == NCCL_ALGO_NVLS) bw = 0.0;
|
||||
if (a == NCCL_ALGO_NVLS_TREE) bw = 0.0;
|
||||
if (collnet == 1) bw = 0.0;
|
||||
int nChannels = 28; //nNodes==1 && MI300
|
||||
float busBw = nChannels * bw; //comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw
|
||||
|
||||
// Various model refinements
|
||||
if (nNodes <= 2)
|
||||
busBw *= tuning_model.bwRatio[0][a][p];
|
||||
else
|
||||
busBw *= tuning_model.bwRatio[1][a][p];
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && nNodes == 1) { busBw = busBw * 1.65; }
|
||||
// Parse protocol from string
|
||||
static int parseProtocol(const char* str) {
|
||||
if (strcmp(str, "ll") == 0) return NCCL_PROTO_LL;
|
||||
if (strcmp(str, "ll128") == 0) return NCCL_PROTO_LL128;
|
||||
if (strcmp(str, "simple") == 0) return NCCL_PROTO_SIMPLE;
|
||||
return NCCL_PROTO_SIMPLE; // default
|
||||
}
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
|
||||
float ratio = 1.0f;
|
||||
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
|
||||
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
|
||||
else ratio *= .5;
|
||||
busBw *= ratio;
|
||||
}
|
||||
bandwidths[coll][a][p] = busBw;
|
||||
latencies[coll][a][p] = baseLat[a][p];
|
||||
float intraLat = tuning_model.hwLat[intraHw[a]][a][p];
|
||||
float interLat = tuning_model.hwLat[NCCL_HW_NET][a][p];
|
||||
// Convert protocol to string
|
||||
static const char* protocolToString(int protocol) {
|
||||
switch (protocol) {
|
||||
case NCCL_PROTO_LL: return "ll";
|
||||
case NCCL_PROTO_LL128: return "ll128";
|
||||
case NCCL_PROTO_SIMPLE: return "simple";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
if (a == NCCL_ALGO_RING) {
|
||||
float lat = tuning_model.hwLat[hw[a]][a][p];
|
||||
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
|
||||
latencies[coll][a][p] += lat;
|
||||
} else {
|
||||
// Inter-node rings still have to launch nsteps * net overhead.
|
||||
float netOverhead = 0.0;
|
||||
if (nNodes > 1) {
|
||||
netOverhead = 1;
|
||||
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
|
||||
}
|
||||
if (intraLat < netOverhead) intraLat = netOverhead;
|
||||
latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
|
||||
}
|
||||
} else if (a == NCCL_ALGO_TREE) {
|
||||
latencies[coll][a][p] +=
|
||||
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
|
||||
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
|
||||
int minimum = 1;
|
||||
if ((nRanks/nNodes-1) < 1) minimum = (nRanks/nNodes-1);
|
||||
latencies[coll][a][p] +=
|
||||
2 * (minimum * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency
|
||||
} else if (a == NCCL_ALGO_COLLNET_CHAIN) {
|
||||
latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
|
||||
} else if (a == NCCL_ALGO_NVLS) {
|
||||
if (nNodes > 1) latencies[coll][a][p] += tuning_model.hwLat[NCCL_HW_NET][a][p];
|
||||
} else if (a == NCCL_ALGO_NVLS_TREE) {
|
||||
latencies[coll][a][p] += 2*(nNodes-1)*tuning_model.hwLat[NCCL_HW_NET][a][p];
|
||||
// Helper function to count valid configuration lines in file
|
||||
static int countConfigLines(const char* filename) {
|
||||
FILE* file = fopen(filename, "r");
|
||||
if (!file) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
char line[MAX_LINE_LENGTH];
|
||||
int count = 0;
|
||||
|
||||
while (fgets(line, sizeof(line), file)) {
|
||||
// Skip comments and empty lines
|
||||
if (line[0] == '#' || line[0] == '\n') continue;
|
||||
|
||||
// Remove trailing newline
|
||||
line[strcspn(line, "\n")] = 0;
|
||||
|
||||
// Check if line has content
|
||||
if (strlen(line) > 0) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
return count;
|
||||
}
|
||||
|
||||
// Load configuration from file
|
||||
static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
|
||||
FILE* file = fopen(filename, "r");
|
||||
if (!file) {
|
||||
if (ctx->logFunction) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Config file %s not found, using defaults", filename);
|
||||
}
|
||||
return ncclSuccess; // Not finding config file is not an error
|
||||
}
|
||||
|
||||
// First pass: count valid configuration lines
|
||||
int configCount = countConfigLines(filename);
|
||||
if (configCount == 0) {
|
||||
if (ctx->logFunction) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: No valid configurations found in %s", filename);
|
||||
}
|
||||
fclose(file);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Allocate memory for configurations based on actual count
|
||||
ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig));
|
||||
if (!ctx->configs) {
|
||||
if (ctx->logFunction) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount);
|
||||
}
|
||||
fclose(file);
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
ctx->maxConfigs = configCount;
|
||||
ctx->numConfigs = 0;
|
||||
|
||||
if (ctx->logFunction) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount);
|
||||
}
|
||||
|
||||
// Reset file pointer to beginning
|
||||
fseek(file, 0, SEEK_SET);
|
||||
|
||||
char line[MAX_LINE_LENGTH];
|
||||
|
||||
while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) {
|
||||
// Skip comments and empty lines
|
||||
if (line[0] == '#' || line[0] == '\n') continue;
|
||||
|
||||
// Remove trailing newline
|
||||
line[strcspn(line, "\n")] = 0;
|
||||
|
||||
// Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||
char* token;
|
||||
char* tokens[CONFIG_FIELDS_MAX];
|
||||
int tokenCount = 0;
|
||||
|
||||
// Make a copy of the line for tokenizing
|
||||
char lineCopy[MAX_LINE_LENGTH];
|
||||
strncpy(lineCopy, line, sizeof(lineCopy));
|
||||
lineCopy[sizeof(lineCopy) - 1] = '\0';
|
||||
|
||||
// Tokenize by comma
|
||||
token = strtok(lineCopy, ",");
|
||||
while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) {
|
||||
// Trim whitespace
|
||||
while (*token == ' ' || *token == '\t') token++;
|
||||
char* end = token + strlen(token) - 1;
|
||||
while (end > token && (*end == ' ' || *end == '\t')) {
|
||||
*end = '\0';
|
||||
end--;
|
||||
}
|
||||
tokens[tokenCount++] = token;
|
||||
token = strtok(NULL, ",");
|
||||
}
|
||||
|
||||
// Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10)
|
||||
if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) {
|
||||
TuningConfig* config = &ctx->configs[ctx->numConfigs];
|
||||
config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]);
|
||||
config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10);
|
||||
config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10);
|
||||
config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]);
|
||||
config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]);
|
||||
config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]);
|
||||
config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]);
|
||||
config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]);
|
||||
|
||||
// numPipeOps is optional (9th field, index 8)
|
||||
if (tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) {
|
||||
config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]);
|
||||
} else {
|
||||
config->numPipeOps = -1; // -1 means match any numPipeOps
|
||||
}
|
||||
|
||||
// regBuff is optional (10th field, index 9)
|
||||
if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) {
|
||||
config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]);
|
||||
} else {
|
||||
config->regBuff = -1; // -1 means match any regBuff value
|
||||
}
|
||||
|
||||
ctx->numConfigs++;
|
||||
|
||||
if (ctx->logFunction) {
|
||||
if (config->numPipeOps == -1 && config->regBuff == -1) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any",
|
||||
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||
config->nChannels, config->nNodes, config->nRanks);
|
||||
} else if (config->regBuff == -1) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any",
|
||||
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||
config->nChannels, config->nNodes, config->nRanks, config->numPipeOps);
|
||||
} else if (config->numPipeOps == -1) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d",
|
||||
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||
config->nChannels, config->nNodes, config->nRanks, config->regBuff);
|
||||
} else {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d",
|
||||
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||
config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Protocols/Algorithms enable/disable, and user overrides.
|
||||
// All are enabled except ll128 which is enabled by default only in certain cases.
|
||||
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
|
||||
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
|
||||
|
||||
// MNNVL: NVLS not yet supported
|
||||
algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
|
||||
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
||||
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
|
||||
algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||
|
||||
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
int pEnable = protoEnable[p];
|
||||
if (p == NCCL_PROTO_LL128) {
|
||||
pEnable = 0;
|
||||
}
|
||||
if (pEnable == 0) bandwidths[c][a][p] = 0;
|
||||
if (algoEnable[a] == 0) bandwidths[c][a][p] = 0;
|
||||
fclose(file);
|
||||
if (ctx->logFunction) {
|
||||
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/*
 * Creates the tuner context for one communicator.
 *
 * Allocates a TunerContext, records the topology (nRanks / nNodes) and the
 * logger, then loads tuning rules from the file named by the
 * NCCL_TUNER_CONFIG_FILE environment variable, or from "nccl_tuner.conf"
 * when that variable is unset.
 *
 * On success the context is stored in *context and ncclSuccess is returned.
 * Returns ncclSystemError if the context cannot be allocated; if loadConfig
 * fails, everything allocated here is released and its result is returned.
 */
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
  TunerContext* tuner = (TunerContext*)malloc(sizeof(TunerContext));
  if (tuner == NULL) return ncclSystemError;

  // Topology and logger first, then an empty rule table.
  tuner->nRanks = nRanks;
  tuner->nNodes = nNodes;
  tuner->logFunction = logFunction;
  tuner->configs = NULL;
  tuner->numConfigs = 0;
  tuner->maxConfigs = 0;

  if (logFunction) {
    logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
  }

  // The environment variable wins; otherwise fall back to the default file name.
  const char* path = getenv("NCCL_TUNER_CONFIG_FILE");
  if (path == NULL) path = "nccl_tuner.conf";

  ncclResult_t status = loadConfig(tuner, path);
  if (status != ncclSuccess) {
    // free(NULL) is a no-op, so no guard is needed around the rule table.
    free(tuner->configs);
    free(tuner);
    return status;
  }

  *context = tuner;
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Applies the first loaded tuning rule that matches the current collective.
 *
 * A rule matches when its collective type equals collType, nBytes lies in
 * [minBytes, maxBytes], and each of nNodes / nRanks / numPipeOps / regBuff is
 * either -1 (wildcard) or equal to the current value.
 *
 * For the first matching rule whose algorithm/protocol indices are inside
 * [0, numAlgo) x [0, numProto) and whose cost-table entry is not
 * NCCL_ALGO_PROTO_IGNORE, the entry is set to 0.0 and, unless the rule's
 * channel count is -1, *nChannels is set to it. Otherwise *nChannels stays at 1
 * and the table is untouched.
 *
 * Returns ncclInternalError when context is NULL, ncclSuccess in every other
 * case (including "no rule matched").
 */
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
                              int regBuff, int* nChannels) {
  TunerContext* ctx = (TunerContext*)context;
  if (!ctx) return ncclInternalError;

  // Default channels
  *nChannels = 1;

  if (ctx->logFunction) {
    ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
                     "TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d",
                     collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs);
  }

  // The cost table is addressed as one contiguous 2-D array with
  // NCCL_NUM_PROTOCOLS columns, not as an array of row pointers.
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;

  // Look for matching configuration
  for (int i = 0; i < ctx->numConfigs; i++) {
    TuningConfig* config = &ctx->configs[i];

    if (ctx->logFunction) {
      ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
                       "TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d",
                       i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol),
                       config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
    }

    // Evaluate each criterion once; the same values feed the mismatch log below.
    int collMatch = (config->collType == collType);
    int sizeMatch = (nBytes >= config->minBytes && nBytes <= config->maxBytes);
    int nodesMatch = (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes);
    int ranksMatch = (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks);
    int pipeOpsMatch = (config->numPipeOps == -1 || config->numPipeOps == numPipeOps);
    int regBuffMatch = (config->regBuff == -1 || config->regBuff == regBuff);

    if (!(collMatch && sizeMatch && nodesMatch && ranksMatch && pipeOpsMatch && regBuffMatch)) {
      if (ctx->logFunction) {
        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                         "TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d",
                         collMatch, sizeMatch, nodesMatch, ranksMatch, pipeOpsMatch, regBuffMatch);
      }
      continue;
    }

    if (ctx->logFunction) {
      ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
                       "TUNER/ExamplePlugin: Config matches. Applying algo=%s, proto=%s, channels=%d",
                       algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels);
    }

    // Check bounds. The lower bound guards against a negative index reaching
    // the table (NOTE(review): confirm whether the parsers can yield one).
    if (config->algorithm < 0 || config->algorithm >= numAlgo ||
        config->protocol < 0 || config->protocol >= numProto) {
      if (ctx->logFunction) {
        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                         "TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)",
                         algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto);
      }
      continue;
    }

    float* cost = &table[config->algorithm][config->protocol];
    if (*cost == NCCL_ALGO_PROTO_IGNORE) {
      if (ctx->logFunction) {
        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                         "TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE",
                         algorithmToString(config->algorithm), protocolToString(config->protocol));
      }
      continue;
    }

    if (ctx->logFunction) {
      // %p requires a void* argument.
      ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
                       "TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)",
                       algorithmToString(config->algorithm), protocolToString(config->protocol),
                       (void*)cost, *cost);
    }
    *cost = 0.0; // Set low cost to prefer this configuration

    // Only override channels if not set to -1 (keep default)
    if (config->nChannels != -1) {
      *nChannels = config->nChannels;
    }

    if (ctx->logFunction) {
      if (config->nChannels == -1) {
        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                         "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)",
                         collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
                         config->nNodes, config->nRanks);
      } else {
        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                         "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)",
                         collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
                         config->nChannels, config->nNodes, config->nRanks);
      }
    }
    return ncclSuccess;
  }

  // If no specific config found, apply default behavior
  if (ctx->logFunction) {
    ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                     "TUNER/ExamplePlugin: No matching config found");
  }

  return ncclSuccess;
}
|
||||
|
||||
/*
 * Releases a tuner context: the dynamically allocated rule table, then the
 * context itself. A NULL context is accepted and ignored.
 *
 * Always returns ncclSuccess.
 */
__hidden ncclResult_t pluginDestroy(void* context) {
  TunerContext* ctx = (TunerContext*)context;
  if (ctx) {
    free(ctx->configs);  // free(NULL) is a no-op, so no separate guard is needed
    free(ctx);
  }
  return ncclSuccess;
}
|
||||
|
||||
#define PLUGIN_NAME "Example"
|
||||
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
# NCCL Tuner Configuration Scripts
|
||||
|
||||
This directory contains scripts for optimizing NCCL tuner configurations based on performance data.
|
||||
|
||||
## optimize_config.py
|
||||
|
||||
A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations.
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
python scripts/optimize_config.py [options] <input_csv_file>
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`)
|
||||
- `-m, --metric METRIC`: Optimization metric, either `bandwidth_gbps` (higher is better) or `latency_us` (lower is better; default)
|
||||
- `--no-header`: Don't add header comments to output file
|
||||
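- `--dry-run`: Print configurations without writing to file
- `--no-auto-ranges`: Disable automatic size range determination and use the hardcoded default ranges
- `--size-ranges RANGES`: Custom size ranges as comma-separated pairs, e.g. `0-1024,1025-65536`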
|
||||
|
||||
### CSV Input Format
|
||||
|
||||
The input CSV file should have the following columns:
|
||||
|
||||
```csv
|
||||
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
|
||||
```
|
||||
|
||||
**Required columns:**
|
||||
- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.)
|
||||
- `size_bytes`: Message size in bytes
|
||||
- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.)
|
||||
- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`)
|
||||
- `channels`: Number of channels (or `-1` for default)
|
||||
- `nodes`: Number of nodes (or `-1` for any)
|
||||
- `ranks`: Number of ranks (or `-1` for any)
|
||||
- `pipeOps`: Number of pipeline operations (or `-1` for any)
|
||||
- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any)
|
||||
|
||||
**Optional metrics (must have at least one present):**
|
||||
- `bandwidth_gbps`: Bandwidth in GB/s (higher is better)
|
||||
- `latency_us`: Latency in microseconds (lower is better)
|
||||
|
||||
### Examples
|
||||
|
||||
**Basic usage (optimizes for `latency_us`, the default metric):**
|
||||
```bash
|
||||
python scripts/optimize_config.py sample_performance_data.csv
|
||||
```
|
||||
|
||||
**Optimize for bandwidth and write to custom file:**
|
||||
```bash
|
||||
python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv
|
||||
```
|
||||
|
||||
**Preview configurations without writing:**
|
||||
```bash
|
||||
python scripts/optimize_config.py --dry-run performance_data.csv
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
1. **Data Loading**: Reads CSV performance data and validates format
|
||||
2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters
|
||||
3. **Size Ranges**: Automatically bins data into size ranges for optimization
|
||||
4. **Optimization**: Finds the best performing configuration for each group/size combination
|
||||
5. **Output**: Generates NCCL tuner config format and appends to specified file
|
||||
|
||||
### Default Size Ranges
|
||||
|
||||
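By default the script derives size ranges from the sizes present in the data, separately for each (nodes, ranks) combination. When run with `--no-auto-ranges`, it uses these hardcoded size ranges instead (in bytes):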
|
||||
- Small: 0 - 1,024
|
||||
- Medium: 1,025 - 65,536
|
||||
- Large: 65,537 - 1,048,576
|
||||
- XLarge: 1,048,577 - 16,777,216
|
||||
- XXLarge: 16,777,217 - 4,294,967,295
|
||||
|
||||
### Sample Data
|
||||
|
||||
See `sample_performance_data.csv` for an example of the expected input format.
|
||||
|
||||
### Integration with NCCL
|
||||
|
||||
The generated configuration file can be used directly with the NCCL tuner plugin:
|
||||
|
||||
```bash
|
||||
export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf
|
||||
export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner-example.so
|
||||
mpirun -np 8 your_nccl_application
|
||||
```
|
||||
|
||||
### Performance Data Collection
|
||||
|
||||
To collect performance data for optimization, you can:
|
||||
|
||||
1. **Use NCCL benchmarks** with different algorithm/protocol combinations
|
||||
2. **Profile your applications** with various tuner settings
|
||||
3. **Run systematic sweeps** across parameter combinations
|
||||
4. **Use NCCL debug output** to collect timing information
|
||||
|
||||
The key is to have comprehensive data covering:
|
||||
- Different message sizes (small to large)
|
||||
- Various topologies (single node, multi-node)
|
||||
- All relevant algorithm/protocol combinations
|
||||
- Different channel counts and pipeline configurations
|
||||
@@ -0,0 +1,430 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
NCCL Tuner Configuration Optimizer
|
||||
|
||||
Reads a CSV file containing performance data across different tuning parameters
|
||||
and generates optimal NCCL tuner configurations based on the best performing
|
||||
combinations.
|
||||
|
||||
By default, creates growing size ranges that interpolate between the actual data sizes
|
||||
for each unique dimension (node count, rank count combination). This ensures that
|
||||
different cluster configurations get their own optimized size boundaries, as
|
||||
performance characteristics often vary significantly between topologies.
|
||||
|
||||
Each dimension gets its own set of ranges starting from 0 and extending to the maximum
|
||||
size for that dimension, with boundaries at midpoints between consecutive data sizes.
|
||||
|
||||
CSV Input Format:
|
||||
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us
|
||||
|
||||
Output Format (NCCL Tuner Config):
|
||||
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||
|
||||
Usage Examples:
|
||||
# Auto-create dimension-specific interpolated ranges (default)
|
||||
python3 optimize_config.py data.csv
|
||||
|
||||
# Use custom size ranges (applied to all topologies)
|
||||
python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576"
|
||||
|
||||
# Use hardcoded default ranges (applied to all topologies)
|
||||
python3 optimize_config.py data.csv --no-auto-ranges
|
||||
"""
|
||||
|
||||
import csv
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Tuple, Any
|
||||
|
||||
class PerformanceData:
    """One measured data point, built from a single CSV row.

    Attributes:
        collective, algorithm, protocol: taken verbatim from the row.
        size_bytes, channels, nodes, ranks, pipeOps, regBuff: parsed as int
            (a value of -1 is kept as -1).
        bandwidth_gbps, latency_us: parsed as float; 0.0 when the column is
            absent from the row.

    Raises:
        KeyError: a required column is absent from the row.
        ValueError: a field is not a valid number, or the row is shorter than
            the header (csv.DictReader fills such fields with None).
    """

    def __init__(self, row: Dict[str, str]):
        self.collective = row['collective']
        self.size_bytes = self._to_int(row, 'size_bytes')
        self.algorithm = row['algorithm']
        self.protocol = row['protocol']
        # int() parses '-1' directly, so the -1 value needs no special case.
        self.channels = self._to_int(row, 'channels')
        self.nodes = self._to_int(row, 'nodes')
        self.ranks = self._to_int(row, 'ranks')
        self.pipeOps = self._to_int(row, 'pipeOps')
        self.regBuff = self._to_int(row, 'regBuff')

        # Performance metrics
        self.bandwidth_gbps = self._to_float(row, 'bandwidth_gbps')  # Higher is better
        self.latency_us = self._to_float(row, 'latency_us')  # Lower is better

    @staticmethod
    def _to_int(row: Dict[str, str], field: str) -> int:
        """Parse a required integer field; a None value is reported as ValueError."""
        raw = row[field]
        if raw is None:
            # Without this, int(None) raises TypeError rather than ValueError.
            raise ValueError(f"missing value for '{field}'")
        return int(raw)

    @staticmethod
    def _to_float(row: Dict[str, str], field: str) -> float:
        """Parse an optional metric field; an absent column yields 0.0."""
        if field not in row:
            return 0.0
        raw = row[field]
        if raw is None:
            raise ValueError(f"missing value for '{field}'")
        return float(raw)

    def get_config_key(self) -> Tuple:
        """Generate a key for grouping similar configurations"""
        return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff)

    def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]:
        """Find which size range this data point belongs to for its dimension.

        Ranges are looked up under this point's (nodes, ranks) key, then under
        (-1, -1), then under the first key present. When no range contains
        size_bytes (or no ranges exist at all), a single-point range
        (size_bytes, size_bytes) is returned.
        """
        topology_key = (self.nodes, self.ranks)

        if topology_key in topology_size_ranges:
            size_ranges = topology_size_ranges[topology_key]
        elif (-1, -1) in topology_size_ranges:
            size_ranges = topology_size_ranges[(-1, -1)]
        else:
            # Fall back to the first available dimension's ranges; an empty
            # mapping yields no ranges instead of raising StopIteration.
            size_ranges = next(iter(topology_size_ranges.values()), [])

        for min_size, max_size in size_ranges:
            if min_size <= self.size_bytes <= max_size:
                return (min_size, max_size)

        # If no range found, create a single-point range
        return (self.size_bytes, self.size_bytes)
|
||||
|
||||
class ConfigOptimizer:
    """Picks the best-performing tuning per group and size range and emits tuner config lines.

    Data points are grouped by (collective, nodes, ranks, pipeOps, regBuff) and
    by size range. Within each group the point with the best value of the
    optimization metric wins. Adjacent ranges that end up with the same
    algorithm/protocol/channels are merged.

    Attributes:
        optimization_metric: 'bandwidth_gbps' (higher is better); any other
            value is treated as latency (lower is better).
        size_ranges: list of (min_bytes, max_bytes) used when auto-ranging is off.
        auto_size_ranges: when True, load_data() derives ranges from the data.
        topology_size_ranges: mapping (nodes, ranks) -> list of size ranges;
            the key (-1, -1) holds ranges that apply to every topology.
    """

    def __init__(self, optimization_metric: str = 'latency_us'):
        self.optimization_metric = optimization_metric
        # Default size ranges - will be overridden by auto-detection
        self.size_ranges = [
            (0, 1024),
            (1025, 64*1024),
            (64*1024+1, 1024*1024),
            (1024*1024+1, 16*1024*1024),
            (16*1024*1024+1, 4*1024*1024*1024-1)
        ]
        self.auto_size_ranges = True
        # load_data() replaces this; initialising it here lets
        # optimize_configurations() run on data that was built some other way.
        self.topology_size_ranges = {(-1, -1): self.size_ranges}

    def set_size_ranges(self, ranges: List[Tuple[int, int]]):
        """Set custom size ranges for optimization (disables auto-ranging)"""
        self.size_ranges = ranges
        self.auto_size_ranges = False
        self.topology_size_ranges = {(-1, -1): ranges}

    def _metric_value(self, item) -> float:
        """Return the value of the active metric, using the same fallback as is_better()."""
        if self.optimization_metric == 'bandwidth_gbps':
            return item.bandwidth_gbps
        return item.latency_us

    def auto_determine_size_ranges(self, data: List['PerformanceData']) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
        """Create growing size ranges for each unique (nodes, ranks) dimension.

        For each dimension the distinct sizes are sorted. The first range
        starts at 0, boundaries sit at the integer midpoint between
        consecutive sizes, and the last range ends at the largest measured
        size. A dimension with a single size gets the one range (0, size).
        Returns the default ranges under (-1, -1) when data is empty.
        A summary of the ranges is printed for each dimension.
        """
        if not data:
            return {(-1, -1): self.size_ranges}

        # Group data by dimension (nodes, ranks)
        topology_data = defaultdict(list)
        for item in data:
            topology_data[(item.nodes, item.ranks)].append(item)

        topology_ranges = {}

        for topology_key, items in topology_data.items():
            nodes, ranks = topology_key

            # Extract unique sizes for this dimension and sort them
            unique_sizes = sorted(set(item.size_bytes for item in items))

            if len(unique_sizes) <= 1:
                # Only one size, create a single range from 0 to that size
                size = unique_sizes[0] if unique_sizes else 0
                ranges = [(0, size)]
            else:
                ranges = []
                last_index = len(unique_sizes) - 1
                for i, size in enumerate(unique_sizes):
                    # Each range starts right after the previous one ends.
                    min_size = 0 if i == 0 else ranges[-1][1] + 1
                    if i == last_index:
                        max_size = size
                    else:
                        max_size = (size + unique_sizes[i + 1]) // 2
                    ranges.append((min_size, max_size))

            topology_ranges[topology_key] = ranges

            print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:")
            for i, (min_size, max_size) in enumerate(ranges):
                # Data points of this dimension that fall inside the range
                in_range = [item.size_bytes for item in items if min_size <= item.size_bytes <= max_size]
                count = len(in_range)
                actual_sizes = sorted(set(in_range))
                if actual_sizes:
                    size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3])
                    if len(actual_sizes) > 3:
                        size_list += f", ... (+{len(actual_sizes)-3} more)"
                    print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})")

        return topology_ranges

    def load_data(self, csv_file: str) -> List['PerformanceData']:
        """Load performance data from CSV file.

        Rows that cannot be parsed are skipped with a warning. The process
        exits with status 1 if the file is missing or cannot be read.
        Also sets topology_size_ranges: auto-derived when auto-ranging is on
        and data was loaded, otherwise size_ranges under the (-1, -1) key.
        """
        data = []
        try:
            # newline='' is what the csv module documents for files handed to a reader.
            with open(csv_file, 'r', newline='') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    try:
                        data.append(PerformanceData(row))
                    except (ValueError, KeyError, TypeError) as e:
                        # TypeError covers rows shorter than the header, whose
                        # missing fields DictReader fills with None.
                        print(f"Warning: Skipping invalid row: {row} - {e}")
        except FileNotFoundError:
            print(f"Error: File {csv_file} not found")
            sys.exit(1)
        except Exception as e:
            print(f"Error reading {csv_file}: {e}")
            sys.exit(1)

        print(f"Loaded {len(data)} performance data points")

        # Auto-determine size ranges if enabled
        if self.auto_size_ranges and data:
            self.topology_size_ranges = self.auto_determine_size_ranges(data)
        else:
            # Use default ranges for all topologies
            self.topology_size_ranges = {(-1, -1): self.size_ranges}

        return data

    def is_better(self, new_data: 'PerformanceData', current_best: 'PerformanceData') -> bool:
        """Determine if new_data is strictly better than current_best.

        Bandwidth is compared higher-is-better; every other metric setting
        falls back to latency, lower-is-better.
        """
        if self.optimization_metric == 'bandwidth_gbps':
            return new_data.bandwidth_gbps > current_best.bandwidth_gbps
        return new_data.latency_us < current_best.latency_us

    def optimize_configurations(self, data: List['PerformanceData']) -> List[str]:
        """Find optimal configurations and return as NCCL config strings.

        Each returned string has the form
        collective,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff.
        A one-line summary is printed for each emitted configuration.
        """
        # Group data by configuration key and size range
        grouped_data = defaultdict(lambda: defaultdict(list))
        for item in data:
            size_range = item.get_size_range_key(self.topology_size_ranges)
            grouped_data[item.get_config_key()][size_range].append(item)

        # Store optimal configurations before combining ranges
        optimal_configs = []

        for config_key, size_ranges_dict in grouped_data.items():
            collective = config_key[0]

            for (min_size, max_size), items in size_ranges_dict.items():
                if not items:
                    continue

                # First item wins ties because is_better() is a strict comparison.
                best_item = items[0]
                for item in items[1:]:
                    if self.is_better(item, best_item):
                        best_item = item

                optimal_configs.append({
                    'collective': collective,
                    'min_size': min_size,
                    'max_size': max_size,
                    'algorithm': best_item.algorithm,
                    'protocol': best_item.protocol,
                    'channels': best_item.channels,
                    'nodes': best_item.nodes,
                    'ranks': best_item.ranks,
                    'pipeOps': best_item.pipeOps,
                    'regBuff': best_item.regBuff,
                    # Same metric fallback as is_better(), so an unrecognised
                    # metric name does not raise AttributeError here.
                    'metric_value': self._metric_value(best_item)
                })

        # Combine sequential ranges with identical tunings
        combined_configs = self.combine_sequential_ranges(optimal_configs)

        # Generate config strings
        configs = []
        for config in combined_configs:
            config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}"
            configs.append(config_str)

            print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: "
                  f"{config['algorithm']}/{config['protocol']} channels={config['channels']} "
                  f"({self.optimization_metric}={config['metric_value']:.3f})")

        return configs

    def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]:
        """Combine sequential ranges that have identical tuning parameters.

        Within one (collective, nodes, ranks, pipeOps, regBuff) group, entries
        sharing (algorithm, protocol, channels) are merged when their ranges
        touch or overlap. The merged entry keeps the better metric value. The
        result is sorted by collective, nodes, ranks, then min_size. The input
        dicts are not modified.
        """
        if not configs:
            return configs

        # Group by collective and topology (nodes, ranks)
        topology_groups = defaultdict(list)
        for config in configs:
            topology_key = (config['collective'], config['nodes'], config['ranks'],
                            config['pipeOps'], config['regBuff'])
            topology_groups[topology_key].append(config)

        higher_is_better = self.optimization_metric == 'bandwidth_gbps'
        combined_configs = []

        for topology_configs in topology_groups.values():
            # Group by tuning parameters (algorithm, protocol, channels)
            tuning_groups = defaultdict(list)
            for config in topology_configs:
                tuning_key = (config['algorithm'], config['protocol'], config['channels'])
                tuning_groups[tuning_key].append(config)

            # For each tuning group, combine sequential ranges
            for tuning_configs in tuning_groups.values():
                ordered = sorted(tuning_configs, key=lambda x: x['min_size'])
                current_config = ordered[0].copy()

                for next_config in ordered[1:]:
                    # Adjacent or overlapping: extend the current range
                    if current_config['max_size'] + 1 >= next_config['min_size']:
                        current_config['max_size'] = max(current_config['max_size'], next_config['max_size'])
                        # Keep the better of the two metric values
                        if higher_is_better:
                            if next_config['metric_value'] > current_config['metric_value']:
                                current_config['metric_value'] = next_config['metric_value']
                        else:  # latency_us or default
                            if next_config['metric_value'] < current_config['metric_value']:
                                current_config['metric_value'] = next_config['metric_value']
                    else:
                        # Gap between ranges, save current and start new one
                        combined_configs.append(current_config)
                        current_config = next_config.copy()

                # Add the last configuration
                combined_configs.append(current_config)

        # Sort final configs by collective, nodes, ranks, then min_size
        combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size']))

        original_count = len(configs)
        combined_count = len(combined_configs)
        if combined_count < original_count:
            print(f"Combined {original_count} ranges into {combined_count} ranges "
                  f"(reduced by {original_count - combined_count})")

        return combined_configs

    def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True):
        """Append optimized configurations to NCCL tuner config file.

        Creates the parent directory and the file when missing. When the file
        already has non-whitespace content, a blank-line separator is written
        first. With add_header, three comment lines describing the generated
        block precede the configurations. Exits with status 1 on any write
        failure.
        """
        try:
            # Create directory if it doesn't exist
            config_dir = os.path.dirname(config_file)
            if config_dir and not os.path.exists(config_dir):
                # exist_ok guards against the directory appearing between the check and the call.
                os.makedirs(config_dir, exist_ok=True)
                print(f"Created directory: {config_dir}")

            # Check if file exists and has content
            file_exists = os.path.exists(config_file)
            add_separator = False

            if file_exists:
                with open(config_file, 'r') as f:
                    content = f.read().strip()
                    add_separator = len(content) > 0
                print(f"Appending to existing file: {config_file}")
            else:
                print(f"Creating new file: {config_file}")

            with open(config_file, 'a') as f:
                if add_separator:
                    f.write("\n\n")

                if add_header:
                    f.write("# Optimized configurations generated by optimize_config.py\n")
                    f.write(f"# Optimization metric: {self.optimization_metric}\n")
                    f.write("# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n")

                for config in configs:
                    f.write(f"{config}\n")

            if file_exists:
                print(f"Appended {len(configs)} optimized configurations to {config_file}")
            else:
                print(f"Created {config_file} with {len(configs)} optimized configurations")

        except PermissionError:
            print(f"Error: Permission denied writing to {config_file}")
            print("Try running with appropriate permissions or choose a different output location")
            sys.exit(1)
        except OSError as e:
            print(f"Error: Cannot create/write to {config_file}: {e}")
            print("Check that the path is valid and you have write permissions")
            sys.exit(1)
        except Exception as e:
            print(f"Unexpected error writing to {config_file}: {e}")
            sys.exit(1)
|
||||
|
||||
def main():
    """Command-line entry point: read performance data and emit tuner configs.

    Parses the command line, configures how the optimizer buckets message
    sizes (custom ranges, default ranges, or automatic ranges), loads the CSV
    file, and then either prints the generated configurations (--dry-run) or
    appends them to the output config file. Exits with status 1 on a malformed
    --size-ranges value or when the CSV yields no data.
    """
    cli = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data")
    cli.add_argument("csv_file", help="Input CSV file with performance data")
    cli.add_argument("-o", "--output", default="nccl_tuner.conf",
                     help="Output NCCL tuner config file (default: nccl_tuner.conf)")
    cli.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'],
                     default='latency_us', help="Optimization metric (default: latency_us)")
    cli.add_argument("--no-header", action="store_true",
                     help="Don't add header comments to output file")
    cli.add_argument("--dry-run", action="store_true",
                     help="Print configurations without writing to file")
    cli.add_argument("--no-auto-ranges", action="store_true",
                     help="Disable automatic size range determination (use default ranges)")
    cli.add_argument("--size-ranges", type=str,
                     help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'")

    args = cli.parse_args()

    optimizer = ConfigOptimizer(args.metric)

    # Choose the size-range strategy; explicit ranges take precedence.
    if args.size_ranges:
        try:
            ranges = []
            for spec in args.size_ranges.split(','):
                # Unpacking into exactly two names rejects specs with a
                # missing or extra '-' by raising ValueError.
                low, high = map(int, spec.split('-'))
                ranges.append((low, high))
            optimizer.set_size_ranges(ranges)
            print(f"Using custom size ranges: {ranges}")
        except ValueError:
            print("Error: Invalid size ranges format. Use 'min1-max1,min2-max2,...'")
            sys.exit(1)
    elif args.no_auto_ranges:
        optimizer.auto_size_ranges = False
        print("Using default hardcoded size ranges")
    else:
        # Default behaviour: automatic ranges.
        optimizer.auto_size_ranges = True
        print("Auto-ranging enabled: will create one bucket per unique size in data")

    # Load the measurements and compute the configurations.
    data = optimizer.load_data(args.csv_file)
    if not data:
        print("No valid data found in CSV file")
        sys.exit(1)

    configs = optimizer.optimize_configurations(data)

    if not args.dry_run:
        optimizer.append_to_config_file(configs, args.output, not args.no_header)
        return

    print("\nGenerated configurations:")
    for line in configs:
        print(line)


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,24 @@
|
||||
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
|
||||
allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5
|
||||
allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8
|
||||
allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2
|
||||
allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1
|
||||
allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3
|
||||
allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4
|
||||
allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2
|
||||
allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1
|
||||
allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7
|
||||
allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1
|
||||
broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2
|
||||
broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1
|
||||
broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5
|
||||
broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2
|
||||
broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1
|
||||
broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3
|
||||
allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4
|
||||
allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6
|
||||
allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1
|
||||
allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3
|
||||
allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2
|
||||
allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1
|
||||
allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3
|
||||
|
@@ -0,0 +1,30 @@
|
||||
#
# Makefile for NCCL Tuner Plugin Unit Tests
#

CC := gcc
CFLAGS := -Wall -Wextra -g -std=c99 -fPIC
INC := -I. -I../nccl
TARGET := test_plugin
SOURCES := test_plugin.c
# test_plugin.c textually includes the plugin implementation
# (#include "../plugin.c"), so the test binary must be rebuilt whenever the
# plugin source changes even though it is not passed to the compiler directly.
PLUGIN_SRC := ../plugin.c

# Default target
all: $(TARGET)

# Build the test executable. Only $(SOURCES) is compiled; $(PLUGIN_SRC) is a
# prerequisite purely so that edits to the plugin trigger a rebuild.
$(TARGET): $(SOURCES) $(PLUGIN_SRC)
	$(CC) $(CFLAGS) $(INC) -o $@ $(SOURCES)

# Run the tests (TEST_CASE is forwarded as the program's argument)
test: $(TARGET)
	./$(TARGET) $(TEST_CASE)

# Run tests with verbose output
test-verbose: $(TARGET)
	NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE)

# Clean build artifacts
clean:
	rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf

.PHONY: all test test-verbose clean
|
||||
@@ -0,0 +1,205 @@
|
||||
# NCCL Tuner Plugin Unit Tests
|
||||
|
||||
This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates.
|
||||
|
||||
## Test Structure
|
||||
|
||||
```
|
||||
test/
|
||||
├── test_plugin.c # Main unit test file
|
||||
├── Makefile # Build system for tests
|
||||
└── README.md # This file
|
||||
```
|
||||
|
||||
## Building and Running Tests
|
||||
|
||||
### Quick Start
|
||||
|
||||
```bash
|
||||
# Build and run all tests
|
||||
make test
|
||||
|
||||
# Or step by step
|
||||
make # Build test executable
|
||||
./test_plugin # Run tests
|
||||
```
|
||||
|
||||
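### Advanced Testing

> **Note:** the `test-memory`, `coverage`, and `test-configs` targets used below must be present in the test Makefile; the Makefile in this directory defines only `all`, `test`, `test-verbose`, and `clean`.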
|
||||
|
||||
```bash
|
||||
# Run with memory leak detection (requires valgrind)
|
||||
make test-memory
|
||||
|
||||
# Run with verbose logging
|
||||
make test-verbose
|
||||
|
||||
# Generate code coverage report (requires gcov)
|
||||
make coverage
|
||||
|
||||
# Create sample test configuration files
|
||||
make test-configs
|
||||
```
|
||||
|
||||
## Test Coverage
|
||||
|
||||
The unit tests cover the following functionality:
|
||||
|
||||
### 1. **Plugin Initialization (`test_plugin_init`)**
|
||||
- Tests successful plugin initialization
|
||||
- Verifies context allocation
|
||||
- Tests cleanup on destroy
|
||||
|
||||
### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)**
|
||||
- Valid CSV format parsing
|
||||
- Comment and empty line handling
|
||||
- Invalid format graceful handling
|
||||
- Environment variable configuration
|
||||
|
||||
### 3. **Collective Type Matching (`test_collective_matching`)**
|
||||
- Correct matching of allreduce, broadcast, etc.
|
||||
- Algorithm/protocol selection
|
||||
- Channel configuration
|
||||
|
||||
### 4. **Size Range Matching (`test_size_matching`)**
|
||||
- Small, medium, large message size handling
|
||||
- Proper range boundary checking
|
||||
- Multiple size-based configurations
|
||||
|
||||
### 5. **Topology Matching (`test_topology_matching`)**
|
||||
- Single-node vs multi-node configurations
|
||||
- Exact nNodes/nRanks matching
|
||||
- Wildcard matching (-1 values)
|
||||
|
||||
### 6. **Default Channels (`test_default_channels`)**
|
||||
- Proper handling of -1 channel specification
|
||||
- Preservation of NCCL default behavior
|
||||
|
||||
### 7. **Registered Buffer Matching (`test_regbuff_matching`)**
|
||||
- Configurations based on regBuff parameter
|
||||
- Registered vs non-registered buffer handling
|
||||
- Backward compatibility with configs missing regBuff
|
||||
|
||||
### 8. **Pipeline Operations Matching (`test_pipeops_matching`)**
|
||||
- Configurations based on numPipeOps parameter
|
||||
- Single vs multiple pipeline operation handling
|
||||
- Backward compatibility with configs missing numPipeOps
|
||||
|
||||
### 9. **Fallback Behavior (`test_no_match_fallback`)**
|
||||
- Default behavior when no config matches
|
||||
- Ring/Simple algorithm fallback
|
||||
|
||||
## Test Output
|
||||
|
||||
Successful test run:
|
||||
```
|
||||
Running NCCL Tuner Plugin Unit Tests
|
||||
=====================================
|
||||
PASS: test_plugin_init
|
||||
PASS: test_config_parsing_valid
|
||||
PASS: test_config_parsing_invalid
|
||||
PASS: test_collective_matching
|
||||
PASS: test_size_matching
|
||||
PASS: test_topology_matching
|
||||
PASS: test_default_channels
|
||||
PASS: test_regbuff_matching
|
||||
PASS: test_pipeops_matching
|
||||
PASS: test_no_match_fallback
|
||||
|
||||
=====================================
|
||||
Test Results: 10/10 tests passed
|
||||
All tests PASSED!
|
||||
```
|
||||
|
||||
Failed test example:
|
||||
```
|
||||
FAIL: test_collective_matching - Tree/Simple should have low cost
|
||||
Test Results: 9/10 tests passed
|
||||
Some tests FAILED!
|
||||
```
|
||||
|
||||
## Mock NCCL Implementation
|
||||
|
||||
The tests use the actual NCCL header files from the `../nccl/` directory:
|
||||
|
||||
- `tuner.h` - Complete NCCL tuner interface and type definitions
|
||||
- `common.h` - Common NCCL types and logging functions
|
||||
- `err.h` - NCCL error codes
|
||||
|
||||
This allows testing with the real NCCL interface definitions while still being able to run tests without the full NCCL library installation.
|
||||
|
||||
## Integration with CI/CD
|
||||
|
||||
```bash
|
||||
# Install tests for CI/CD pipeline
|
||||
make install-test
|
||||
|
||||
# Run as part of automated testing
|
||||
make test && echo "Tests passed" || echo "Tests failed"
|
||||
```
|
||||
|
||||
## Memory Testing
|
||||
|
||||
The tests can be run with valgrind for memory leak detection:
|
||||
|
||||
```bash
|
||||
make test-memory
|
||||
```
|
||||
|
||||
This will detect:
|
||||
- Memory leaks
|
||||
- Invalid memory access
|
||||
- Use of uninitialized memory
|
||||
|
||||
## Code Coverage
|
||||
|
||||
Generate code coverage reports to ensure comprehensive testing:
|
||||
|
||||
```bash
|
||||
make coverage
|
||||
# Creates test_plugin.c.gcov with line-by-line coverage
|
||||
```
|
||||
|
||||
## Adding New Tests
|
||||
|
||||
To add a new test:
|
||||
|
||||
1. Create a new test function in `test_plugin.c`:
|
||||
```c
|
||||
int test_new_feature() {
|
||||
// Test setup
|
||||
TEST_ASSERT(condition, "description");
|
||||
// Test cleanup
|
||||
TEST_PASS();
|
||||
}
|
||||
```
|
||||
|
||||
2. Add the test to the main function:
|
||||
```c
|
||||
total++; passed += test_new_feature();
|
||||
```
|
||||
|
||||
3. Rebuild and run:
|
||||
```bash
|
||||
make test
|
||||
```
|
||||
|
||||
## Debugging Tests
|
||||
|
||||
For debugging failed tests:
|
||||
|
||||
```bash
|
||||
# Compile with debug symbols
|
||||
make CFLAGS="-g -O0 -DDEBUG"
|
||||
|
||||
# Run with gdb
|
||||
gdb ./test_plugin
|
||||
```
|
||||
|
||||
## Cleaning Up
|
||||
|
||||
```bash
|
||||
# Remove all build artifacts and temporary files
|
||||
make clean
|
||||
```
|
||||
|
||||
This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases.
|
||||
@@ -0,0 +1,856 @@
|
||||
/*************************************************************************
|
||||
* Unit tests for NCCL Tuner Plugin
|
||||
************************************************************************/
|
||||
|
||||
#define _GNU_SOURCE // Enable setenv/unsetenv and other GNU extensions
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
|
||||
// Include NCCL tuner header (which includes common.h and err.h)
|
||||
#include "tuner.h"
|
||||
|
||||
// Include plugin source for testing
|
||||
#include "../plugin.c"
|
||||
|
||||
// Minimal test-framework macros. Both expand to a `return`, so they may only
// be used inside test functions that return int (1 = pass, 0 = fail).

// TEST_ASSERT: if `cond` is false, print a FAIL line tagged with the name of
// the enclosing function and make that function return 0.
#define TEST_ASSERT(cond, msg) \
  do { \
    if (!(cond)) { \
      printf("FAIL: %s - %s\n", __func__, (msg)); \
      return 0; \
    } \
  } while (0)

// TEST_PASS: print a PASS line tagged with the name of the enclosing function
// and make that function return 1.
#define TEST_PASS() \
  do { \
    printf("PASS: %s\n", __func__); \
    return 1; \
  } while (0)
|
||||
|
||||
// Global test state
|
||||
static int test_log_count = 0;
|
||||
|
||||
// Mock logger function
|
||||
void mock_logger(ncclDebugLogLevel level, unsigned long flags,
|
||||
const char* file, int line, const char* fmt, ...) {
|
||||
(void)flags; // Suppress unused parameter warning
|
||||
test_log_count++;
|
||||
|
||||
// Check if we should print based on NCCL_DEBUG level
|
||||
const char* debug_level = getenv("NCCL_DEBUG");
|
||||
int should_print = 0;
|
||||
|
||||
if (debug_level) {
|
||||
if (strcmp(debug_level, "TRACE") == 0) {
|
||||
should_print = 1; // Print everything
|
||||
} else if (strcmp(debug_level, "INFO") == 0 && level <= NCCL_LOG_INFO) {
|
||||
should_print = 1; // Print INFO and below
|
||||
} else if (strcmp(debug_level, "WARN") == 0 && level <= NCCL_LOG_WARN) {
|
||||
should_print = 1; // Print WARN and below
|
||||
}
|
||||
}
|
||||
|
||||
if (!should_print) return;
|
||||
|
||||
// Convert log level to string
|
||||
const char* level_str;
|
||||
switch(level) {
|
||||
case NCCL_LOG_NONE: level_str = "NONE"; break;
|
||||
case NCCL_LOG_VERSION: level_str = "VERSION"; break;
|
||||
case NCCL_LOG_WARN: level_str = "WARN"; break;
|
||||
case NCCL_LOG_INFO: level_str = "INFO"; break;
|
||||
case NCCL_LOG_ABORT: level_str = "ABORT"; break;
|
||||
case NCCL_LOG_TRACE: level_str = "TRACE"; break;
|
||||
default: level_str = "UNKNOWN"; break;
|
||||
}
|
||||
|
||||
// Print log header
|
||||
printf("[TUNER:%s:%s:%d] ", level_str, file, line);
|
||||
|
||||
// Print formatted message
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
vprintf(fmt, args);
|
||||
va_end(args);
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Helper function to create a test config file.
//
// Creates (or truncates) `filename` and writes `content` to it verbatim.
// The function still returns nothing, but failures to open, write or close
// the file are now reported on stderr via perror() instead of being silently
// ignored, so a test that later fails because its config file is missing
// points at the real cause.
void create_test_config(const char* filename, const char* content) {
  FILE* out = fopen(filename, "w");
  if (out == NULL) {
    perror(filename);
    return;
  }

  if (fputs(content, out) == EOF) {
    perror(filename);
  }

  // fclose flushes buffered data, so a failure here also means a bad file.
  if (fclose(out) != 0) {
    perror(filename);
  }
}
|
||||
|
||||
// Test 1: Plugin initialization
|
||||
int test_plugin_init() {
|
||||
void* context = NULL;
|
||||
|
||||
// Test successful initialization
|
||||
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
|
||||
TEST_ASSERT(context != NULL, "Context should be allocated");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 2: Configuration file parsing - valid CSV
|
||||
int test_config_parsing_valid() {
|
||||
const char* test_config =
|
||||
"# Test configuration\n"
|
||||
"allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n"
|
||||
"broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n"
|
||||
"# Comment line\n"
|
||||
"\n" // Empty line
|
||||
"reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n";
|
||||
|
||||
create_test_config("test_valid.conf", test_config);
|
||||
|
||||
// Set environment variable to use our test config
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_valid.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 3: Configuration file parsing - invalid CSV
|
||||
int test_config_parsing_invalid() {
|
||||
const char* test_config =
|
||||
"allreduce,0,65536,tree,simple,2,1 # Missing nRanks and other fields\n"
|
||||
"invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n"
|
||||
"broadcast,abc,def,ring,simple,1,1,1,-1,-1\n"; // Invalid numbers
|
||||
|
||||
create_test_config("test_invalid.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
|
||||
// Should still succeed but with no valid configs loaded
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_invalid.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 4: Collective type matching
|
||||
int test_collective_matching() {
|
||||
const char* test_config =
|
||||
"allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n"
|
||||
"broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n";
|
||||
|
||||
create_test_config("test_match.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
|
||||
// Create mock cost table
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0; // Default high cost
|
||||
}
|
||||
}
|
||||
|
||||
int nChannels;
|
||||
|
||||
// Test allreduce matching (should match first config)
|
||||
ncclResult_t result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
|
||||
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||
&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost");
|
||||
TEST_ASSERT(nChannels == 8, "Should set 8 channels");
|
||||
|
||||
// Test broadcast matching (should match second config)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0; // Reset costs
|
||||
}
|
||||
}
|
||||
|
||||
result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
|
||||
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost");
|
||||
TEST_ASSERT(nChannels == 4, "Should set 4 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_match.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 5: Size range matching
|
||||
int test_size_matching() {
|
||||
const char* test_config =
|
||||
"allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n"
|
||||
"allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n"
|
||||
"allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n";
|
||||
|
||||
create_test_config("test_size.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
int nChannels = 1;
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||
&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost");
|
||||
TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels");
|
||||
|
||||
// Test medium message (should match second config)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost");
|
||||
TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels");
|
||||
|
||||
// Test large message (should match third config)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
|
||||
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost");
|
||||
TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_size.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 6: Topology matching
|
||||
int test_topology_matching() {
|
||||
const char* test_config =
|
||||
"allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" // Single node only
|
||||
"allreduce,0,65536,ring,simple,4,4,32,-1,-1\n" // 4 nodes, 32 ranks exactly
|
||||
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any topology
|
||||
|
||||
create_test_config("test_topo.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1);
|
||||
|
||||
// Test with single node setup
|
||||
void* context1 = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
int nChannels;
|
||||
pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
|
||||
TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
|
||||
|
||||
pluginDestroy(context1);
|
||||
|
||||
// Test with 4 nodes, 32 ranks setup
|
||||
void* context2 = NULL;
|
||||
pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes
|
||||
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config");
|
||||
TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels");
|
||||
|
||||
// Clean up
|
||||
unlink("test_topo.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 7: Default channels behavior (-1)
|
||||
int test_default_channels() {
|
||||
const char* test_config =
|
||||
"allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n"; // Use default channels
|
||||
|
||||
create_test_config("test_default.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
int nChannels = 99; // Set to known value
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol");
|
||||
TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_default.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 8: regBuff matching
|
||||
int test_regbuff_matching() {
|
||||
const char* test_config =
|
||||
"allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n" // Registered buffers only
|
||||
"allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n" // Non-registered buffers only
|
||||
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any buffer type (backward compatible)
|
||||
|
||||
create_test_config("test_regbuff.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
}
|
||||
|
||||
int nChannels;
|
||||
|
||||
// Test registered buffer (should match first config)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
1, &nChannels); // regBuff = 1 (registered)
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost");
|
||||
TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels");
|
||||
|
||||
// Test non-registered buffer (should match second config)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels); // regBuff = 0 (non-registered)
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost");
|
||||
TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels");
|
||||
|
||||
// Test backward compatibility - config without regBuff should match any regBuff value
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
// First try with regBuff=2 (unusual value, should match third config)
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
2, &nChannels); // regBuff = 2 (only third config should match)
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost");
|
||||
TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_regbuff.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 9: numPipeOps matching
|
||||
int test_pipeops_matching() {
|
||||
const char* test_config =
|
||||
"allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n" // Single pipeline op
|
||||
"allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n" // Multiple pipeline ops
|
||||
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any pipeline ops (backward compatible)
|
||||
|
||||
create_test_config("test_pipeops.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
}
|
||||
|
||||
int nChannels;
|
||||
|
||||
// Test single pipeline op (should match first config)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost");
|
||||
TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels");
|
||||
|
||||
// Test multiple pipeline ops (should match second config)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 4,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost");
|
||||
TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels");
|
||||
|
||||
// Test different number of pipeline ops (should match third config - backward compatible)
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 2,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any pipeOps: Ring/LL128 should have low cost");
|
||||
TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_pipeops.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 10: No matching configuration (fallback behavior)
|
||||
int test_no_match_fallback() {
|
||||
const char* test_config =
|
||||
"broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n"; // Only broadcast config
|
||||
|
||||
create_test_config("test_fallback.conf", test_config);
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
int nChannels;
|
||||
// Try allreduce (should not match, use fallback)
|
||||
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should use pass through unmodified");
|
||||
TEST_ASSERT(nChannels == 1, "Should use default channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink("test_fallback.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 11: Large configuration files (testing dynamic allocation)
|
||||
int test_large_config() {
|
||||
const char* large_config_file = "test_large.conf";
|
||||
|
||||
// Create a large configuration file with many entries
|
||||
// This tests the dynamic allocation functionality
|
||||
FILE* f = fopen(large_config_file, "w");
|
||||
TEST_ASSERT(f != NULL, "Should be able to create large config file");
|
||||
|
||||
// Write header comment
|
||||
fprintf(f, "# Large configuration file for testing dynamic allocation\n");
|
||||
fprintf(f, "# This file contains many configurations to test memory allocation\n");
|
||||
|
||||
// Generate a large number of configurations (much more than the old MAX_CONFIGS=100)
|
||||
const int num_configs = 500; // 5x the old static limit
|
||||
const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"};
|
||||
const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"};
|
||||
const char* protocols[] = {"simple", "ll", "ll128"};
|
||||
|
||||
for (int i = 0; i < num_configs; i++) {
|
||||
// Vary the configurations to create realistic test data
|
||||
const char* coll = collectives[i % 5];
|
||||
const char* algo = algorithms[i % 4];
|
||||
const char* proto = protocols[i % 3];
|
||||
|
||||
size_t min_bytes = (i * 1024) % 1048576; // Vary from 0 to 1MB
|
||||
size_t max_bytes = min_bytes + 65536; // 64KB range
|
||||
int channels = (i % 8) + 1; // 1-8 channels
|
||||
int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes
|
||||
int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks
|
||||
int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps
|
||||
int regBuff = (i % 3) == 0 ? -1 : (i % 2); // Mix of -1, 0, 1
|
||||
|
||||
fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n",
|
||||
coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff);
|
||||
}
|
||||
|
||||
fclose(f);
|
||||
|
||||
// Set environment to use our large config file
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1);
|
||||
|
||||
// Initialize plugin with large config
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
|
||||
TEST_ASSERT(context != NULL, "Context should be allocated");
|
||||
|
||||
// Verify that configurations were loaded
|
||||
TunerContext* ctx = (TunerContext*)context;
|
||||
TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file");
|
||||
TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size");
|
||||
TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated");
|
||||
|
||||
// Test that we can access configurations throughout the array
|
||||
// (This would have failed with the old static MAX_CONFIGS=100 limit)
|
||||
for (int i = 0; i < ctx->numConfigs; i++) {
|
||||
TuningConfig* config = &ctx->configs[i];
|
||||
// Basic sanity checks on the loaded configurations
|
||||
TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce,
|
||||
"Collective type should be valid");
|
||||
TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes");
|
||||
TEST_ASSERT(config->nChannels > 0, "nChannels should be positive");
|
||||
}
|
||||
|
||||
// Test specific configuration access at various indices
|
||||
// Index 0 (first config)
|
||||
TuningConfig* first_config = &ctx->configs[0];
|
||||
TEST_ASSERT(first_config != NULL, "First config should be accessible");
|
||||
|
||||
// Index in middle
|
||||
TuningConfig* mid_config = &ctx->configs[num_configs / 2];
|
||||
TEST_ASSERT(mid_config != NULL, "Middle config should be accessible");
|
||||
|
||||
// Index near end (this would have crashed with static array of 100)
|
||||
TuningConfig* late_config = &ctx->configs[num_configs - 1];
|
||||
TEST_ASSERT(late_config != NULL, "Last config should be accessible");
|
||||
|
||||
// Test memory allocation size - verify we didn't over-allocate
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs);
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"Memory allocated for %d configurations (%zu bytes total)",
|
||||
ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig));
|
||||
|
||||
// Test that the plugin can still find matching configurations from the large set
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0; // Default high cost
|
||||
}
|
||||
}
|
||||
|
||||
int nChannels;
|
||||
// Try to find a matching configuration - should work with large config set
|
||||
result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink(large_config_file);
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 12: Very large configuration stress test
|
||||
int test_very_large_config_stress() {
|
||||
const char* stress_config_file = "test_stress.conf";
|
||||
|
||||
// Create an even larger configuration file to stress test the implementation
|
||||
FILE* f = fopen(stress_config_file, "w");
|
||||
TEST_ASSERT(f != NULL, "Should be able to create stress test config file");
|
||||
|
||||
fprintf(f, "# Stress test configuration with very large number of entries\n");
|
||||
|
||||
// Generate an extremely large number of configurations
|
||||
const int stress_configs = 2000; // 20x the old static limit
|
||||
|
||||
for (int i = 0; i < stress_configs; i++) {
|
||||
// Create varied but valid configurations
|
||||
fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n",
|
||||
i * 512, (i * 512) + 1024);
|
||||
}
|
||||
|
||||
fclose(f);
|
||||
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1);
|
||||
|
||||
// Test initialization with stress config
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
|
||||
|
||||
TunerContext* ctx = (TunerContext*)context;
|
||||
TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations");
|
||||
TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated");
|
||||
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"Stress test - loaded %d configurations successfully", stress_configs);
|
||||
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||
"Memory usage: %zu bytes for configuration array",
|
||||
stress_configs * sizeof(TuningConfig));
|
||||
|
||||
// Verify we can access configurations throughout the entire range
|
||||
for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config
|
||||
TuningConfig* config = &ctx->configs[i];
|
||||
TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type");
|
||||
TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes");
|
||||
}
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink(stress_config_file);
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test 13: Edge case - empty config file
|
||||
int test_empty_config() {
|
||||
const char* empty_config_file = "test_empty.conf";
|
||||
|
||||
// Create empty config file (only comments)
|
||||
create_test_config(empty_config_file,
|
||||
"# Empty configuration file\n"
|
||||
"# No actual configurations\n"
|
||||
"\n"
|
||||
"\n");
|
||||
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
|
||||
|
||||
TunerContext* ctx = (TunerContext*)context;
|
||||
TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations");
|
||||
TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations");
|
||||
TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config");
|
||||
|
||||
// Test that plugin still works with no configurations (fallback behavior)
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
cost_table_ptr[i] = cost_table[i];
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
cost_table[i][j] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
int nChannels;
|
||||
result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
0, &nChannels);
|
||||
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
unlink(empty_config_file);
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Signature shared by every test entry point: takes no arguments and returns
// an int (main adds the return values together to count passed tests).
typedef int (*TestFunction)(void);

// One row of the test registry. Field order matters to any positional
// initializer: name, func, description.
typedef struct TestCase {
  const char* name;        // Name used to select the test on the command line
  TestFunction func;       // Entry point to invoke
  const char* description; // One-line summary printed by the help output
} TestCase;
|
||||
|
||||
// All available tests
|
||||
TestCase test_cases[] = {
|
||||
{"init", test_plugin_init, "Plugin initialization"},
|
||||
{"config-valid", test_config_parsing_valid, "Valid configuration parsing"},
|
||||
{"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"},
|
||||
{"collective", test_collective_matching, "Collective type matching"},
|
||||
{"size", test_size_matching, "Size range matching"},
|
||||
{"topology", test_topology_matching, "Topology matching"},
|
||||
{"channels", test_default_channels, "Default channels behavior"},
|
||||
{"regbuff", test_regbuff_matching, "Registered buffer matching"},
|
||||
{"pipeops", test_pipeops_matching, "Pipeline operations matching"},
|
||||
{"fallback", test_no_match_fallback, "Fallback behavior"},
|
||||
{"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
|
||||
{"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
|
||||
{"empty-config", test_empty_config, "Empty configuration file handling"},
|
||||
{NULL, NULL, NULL} // End marker
|
||||
};
|
||||
|
||||
// Show help/usage information
|
||||
void show_help(const char* program_name) {
|
||||
printf("Usage: %s [test_name ...]\n\n", program_name);
|
||||
printf("Available tests:\n");
|
||||
for (int i = 0; test_cases[i].name != NULL; i++) {
|
||||
printf(" %-15s - %s\n", test_cases[i].name, test_cases[i].description);
|
||||
}
|
||||
printf("\nExamples:\n");
|
||||
printf(" %s # Run all tests\n", program_name);
|
||||
printf(" %s init # Run only initialization test\n", program_name);
|
||||
printf(" %s init collective # Run initialization and collective tests\n", program_name);
|
||||
printf(" %s --help # Show this help\n", program_name);
|
||||
}
|
||||
|
||||
// Find test by name
|
||||
TestFunction find_test(const char* name) {
|
||||
for (int i = 0; test_cases[i].name != NULL; i++) {
|
||||
if (strcmp(test_cases[i].name, name) == 0) {
|
||||
return test_cases[i].func;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Main test runner
|
||||
int main(int argc, char* argv[]) {
|
||||
int passed = 0, total = 0;
|
||||
|
||||
// Check for help
|
||||
if (argc > 1 && (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) {
|
||||
show_help(argv[0]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("Running NCCL Tuner Plugin Unit Tests\n");
|
||||
printf("=====================================\n");
|
||||
|
||||
if (argc == 1) {
|
||||
// No arguments - run all tests
|
||||
for (int i = 0; test_cases[i].name != NULL; i++) {
|
||||
total++;
|
||||
passed += test_cases[i].func();
|
||||
}
|
||||
} else {
|
||||
// Run specific tests
|
||||
for (int arg = 1; arg < argc; arg++) {
|
||||
TestFunction test_func = find_test(argv[arg]);
|
||||
if (test_func) {
|
||||
total++;
|
||||
passed += test_func();
|
||||
} else {
|
||||
printf("ERROR: Unknown test '%s'\n", argv[arg]);
|
||||
printf("Use --help to see available tests\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n=====================================\n");
|
||||
printf("Test Results: %d/%d tests passed\n", passed, total);
|
||||
|
||||
if (passed == total) {
|
||||
printf("All tests PASSED!\n");
|
||||
return 0;
|
||||
} else {
|
||||
printf("Some tests FAILED!\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur