Merge remote-tracking branch 'nccl/master' into develop

Cette révision appartient à :
BertanDogancay
2025-10-06 15:03:19 -05:00
révisé par Corey Derochie
révision 3f94267f21
51 fichiers modifiés avec 3682 ajouts et 461 suppressions
+46 -8
Voir le fichier
@@ -3,15 +3,53 @@
#
# See LICENSE.txt for license information
#
RCCL_HOME:=../../build/release
HIP_HOME:=/opt/rocm
INC:= -I$(RCCL_HOME)/include/ -I$(HIP_HOME)/include/ -D__HIP_PLATFORM_AMD__ -Inccl
PLUGIN_SO:=libnccl-tuner.so
default: $(PLUGIN_SO)
.DEFAULT_GOAL: build
PLUGIN_SO:=libnccl-tuner-example.so
include ../../makefiles/common.mk
SRCDIR ?= $(abspath ../..)
BUILDDIR ?= .
NCCLDIR := $(BUILDDIR)
$(PLUGIN_SO): plugin.c
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
SRC_FILES := $(wildcard *.c)
DST_DIR := $(BUILDDIR)/test/unit/plugins
default: ${BUILDDIR}/$(PLUGIN_SO)
build: ${BUILDDIR}/$(PLUGIN_SO)
${BUILDDIR}/$(PLUGIN_SO): plugin.c
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${BUILDDIR}
$(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
# Test targets - delegate to test directory
test:
$(MAKE) -C test test TEST_CASE=$(TEST_CASE)
test-verbose:
$(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE)
# Build tests
test-build:
$(MAKE) -C test all
# Optimize configurations from performance data
optimize-config:
@if [ -z "$(CSV_FILE)" ]; then \
echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \
echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \
exit 1; \
fi
python3 scripts/optimize_config.py $(CSV_FILE) \
$(if $(OUTPUT),-o $(OUTPUT)) \
$(if $(METRIC),-m $(METRIC)) \
$(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \
$(if $(DRY_RUN),--dry-run) \
$(if $(NO_HEADER),--no-header)
clean:
rm -f $(PLUGIN_SO)
rm -f ${BUILDDIR}/$(PLUGIN_SO)
$(MAKE) -C test clean
.PHONY: test test-verbose test-build optimize-config clean
+163
Voir le fichier
@@ -0,0 +1,163 @@
# NCCL Example Tuner Plugin
This example plugin shows a practical example of a CSV file-based tuning approach, allowing selective overrides for tuning parameters based on all tuning inputs without recompiling.
## Features
- **File-based Configuration**: Read tuning parameters from a CSV configuration file
- **Size-based Tuning**: Specify different configurations based on message size ranges
- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks
- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default
- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE`
- **Fallback Behavior**: Gracefully handles missing config files and invalid entries
## Building
```bash
make
```
This will create `libnccl-tuner-example.so` that can be loaded by NCCL.
## Configuration File Format
The configuration file uses CSV (Comma-Separated Values) format with one configuration per line:
```
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
```
### Parameters
- **collective_type**: The collective operation type
- `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce`
- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies
- Use `0` for minimum and `4294967295` for maximum (covers all sizes)
- **algorithm**: The NCCL algorithm to use
- `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat`
- **protocol**: The NCCL protocol to use
- `ll`, `ll128`, `simple`
- **channels**: Number of channels (SMs) to use
- Use a positive integer to specify exact channel count
- Use `-1` to keep NCCL's default channel selection
- **nNodes**: Number of nodes to match
- Use a positive integer to match specific node count
- Use `-1` to match any number of nodes
- **nRanks**: Number of ranks to match
- Use a positive integer to match specific rank count
- Use `-1` to match any number of ranks
- **numPipeOps**: Number of pipeline operations to match (optional)
- Use a positive integer to match specific pipeline operation count
- Use `-1` to match any number of pipeline operations
- If omitted, configuration will match any numPipeOps value
- **regBuff**: Whether user buffer can be registered (optional)
- Use `0` to match only non-registered buffers
- Use `1` to match only registered buffers
- Use `-1` to match either registered or non-registered buffers
- If omitted, configuration will match any regBuff value
### Example Configuration
```csv
# Single-node, small allreduce: use tree algorithm, registered buffers only
allreduce,0,65536,tree,simple,2,1,-1,-1,1
# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers
allreduce,65537,1048576,ring,simple,4,4,32,1,0
# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type
allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
# Single-node broadcast: prefer tree, any pipeOps, registered buffers (backward compatible)
broadcast,0,32768,tree,simple,-1,1,-1
# Multi-node broadcast: optimized for non-registered buffers, single pipeline op
broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
```
Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc.
### Backward Compatibility
Configurations without the numPipeOps and/or regBuff parameters are fully supported:
- 8 fields: matches any numPipeOps and regBuff values
- 9 fields: matches any regBuff value
- 10 fields: full parameter specification
This ensures existing configuration files continue to work without modification.
## Usage
### Method 1: Default Config File
Place your configuration in `nccl_tuner.conf` in the current working directory.
### Method 2: Environment Variable
Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path:
```bash
export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf
mpirun -np 4 your_nccl_application
```
## Editing Configuration Files
### Generating Configuration Files from Raw Data
A python script to generate valid CSV configs has been provided. [Using optimize_config.py](scripts/README.md).
### Spreadsheet Tips:
- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff`
- Save as CSV format (not Excel format) for the plugin to read
- Use data validation to prevent typos in algorithm/protocol names
## Logging
The plugin uses NCCL's logging system. To see tuner-related messages:
```bash
export NCCL_DEBUG=INFO
```
This will show when configurations are loaded and applied, including the topology information.
For detailed debugging output during tuning decisions:
```bash
export NCCL_DEBUG=TRACE
```
This will show verbose information about which configurations are being evaluated and matched.
## Dimension Matching
Configurations are only applied when the topology matches:
- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks
- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks
- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node
- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology
This allows you to create specialized configurations for different cluster setups while maintaining flexibility.
## Default Behavior
If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic.
When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size.
## Troubleshooting
1. **Config file not found**: Check the file path and permissions
2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters
3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory and that `NCCL_TUNER_PLUGIN` either specifies the plugin name, or an absolute path to the plugin shared library.
4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO`
5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards
6. **CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces
+45
Voir le fichier
@@ -0,0 +1,45 @@
# NCCL Tuner Configuration File (CSV Format)
# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
#
# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
# Protocols: ll, ll128, simple
# Channels: number of channels to use, or -1 to keep default
# nNodes: number of nodes to match, or -1 for any number of nodes
# nRanks: number of ranks to match, or -1 for any number of ranks
# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
#
# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
#
# Examples:
# For single-node configurations with registered buffers
# Small allreduce operations on single node - use tree algorithm, registered buffers
allreduce,0,65536,tree,simple,2,1,-1,-1,1
# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers
# Medium allreduce operations - use ring algorithm
allreduce,65537,1048576,ring,simple,4,4,32,1,0
# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type
allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types
# Single node broadcast - prefer tree, any pipeOps, registered buffers only
broadcast,0,32768,tree,simple,-1,1,-1,-1,1
# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring
broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type
allgather,0,4294967295,ring,simple,4,2,-1
# ReduceScatter operations
# Small messages on single node, single pipeline op, registered buffers
reducescatter,0,131072,tree,simple,2,1,-1,1,1
# Large messages on any topology, multiple pipeline ops, non-registered buffers
reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0
# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type
reduce,0,4294967295,tree,simple,-1,-1,-1
+411 -189
Voir le fichier
@@ -5,224 +5,446 @@
************************************************************************/
#include "tuner.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define __hidden __attribute__ ((visibility("hidden")))
#define HOPPER_COMPCAP_IDX 2
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
#define NCCL_HW_PCI 1
#define NCCL_HW_NET 2
#define MAX_LINE_LENGTH 256
static long log2i(long n) {
long l = 0;
while (n>>=1) l++;
return l;
}
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
{ 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Tree, Ring
{ 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Collnet Direct, Chain
{ 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree
// CSV field indices for configuration parsing
// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
#define CONFIG_FIELD_COLLTYPE 0
#define CONFIG_FIELD_MINBYTES 1
#define CONFIG_FIELD_MAXBYTES 2
#define CONFIG_FIELD_ALGORITHM 3
#define CONFIG_FIELD_PROTOCOL 4
#define CONFIG_FIELD_CHANNELS 5
#define CONFIG_FIELD_NNODES 6
#define CONFIG_FIELD_NRANKS 7
#define CONFIG_FIELD_PIPEOPS 8 // Optional field
#define CONFIG_FIELD_REGBUFF 9 // Optional field
struct tuningModel {
float hwLat[3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bwRatio[2][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
};
// Field count constants
#define CONFIG_FIELDS_REQUIRED 8 // Minimum required fields (up to nRanks)
#define CONFIG_FIELDS_WITH_PIPEOPS 9 // Fields including numPipeOps
#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff
#define CONFIG_FIELDS_MAX 10 // Maximum number of fields supported
static struct tuningModel tuning_model = {
{
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
typedef struct {
ncclFunc_t collType;
size_t minBytes;
size_t maxBytes;
int algorithm;
int protocol;
int nChannels;
int nNodes;
int nRanks;
int numPipeOps;
int regBuff;
} TuningConfig;
{
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 0.86 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
typedef struct {
TuningConfig* configs; // Changed from static array to dynamic pointer
int numConfigs;
int maxConfigs; // Added to track allocated size
size_t nRanks;
size_t nNodes;
ncclDebugLogger_t logFunction;
} TunerContext;
{
{ 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1, 0.4, 0.5, 1.0, 0.6, 0.4, 0.6, 0.1, 0.3, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
{ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 0.2, 0.7, 1.0, 1.0, 1.0, 0.8, 0.7, 0.7, 0.8, 0.8, 0.8, 0.9, },
},
{
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.2, 0.2, 0.1, 0.5, 0.8, 1.0, 0.2, 0.4, 0.5, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
{ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
},
};
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorithm, int protocol, int numPipeOps, float* time, size_t nBytes) {
float bw = bandwidths[collType][algorithm][protocol];
float lat = latencies[collType][algorithm][protocol];
if (bw == 0) {
*time = -1.0; return ncclSuccess;
}
int logSize = log2i(nBytes>>6);
if (algorithm == NCCL_ALGO_TREE) {
if (logSize < 27) bw *= tuning_model.treeCorrectionFactor[protocol][logSize];
else bw *= tuning_model.treeCorrectionFactor[protocol][26];
}
else if (algorithm == NCCL_ALGO_RING) {
if(logSize < 27) bw *= tuning_model.ringCorrectionFactor[protocol][logSize];
else bw *= tuning_model.ringCorrectionFactor[protocol][26];
}
int latCount = 1;
*time = lat * latCount + (nBytes) / (1000 * bw);
return ncclSuccess;
// Parse collective type from string
static ncclFunc_t parseCollType(const char* str) {
if (strcmp(str, "broadcast") == 0) return ncclFuncBroadcast;
if (strcmp(str, "reduce") == 0) return ncclFuncReduce;
if (strcmp(str, "allgather") == 0) return ncclFuncAllGather;
if (strcmp(str, "reducescatter") == 0) return ncclFuncReduceScatter;
if (strcmp(str, "allreduce") == 0) return ncclFuncAllReduce;
return ncclFuncAllReduce; // default
}
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
if (nRanks <= 1) return ncclSuccess;
int compCapIndex = HOPPER_COMPCAP_IDX;
int index2 = nNodes <= 2 ? nNodes-1 : 2;
int index1 = nNodes == 1 ? compCapIndex : 1;
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
// Convert collective type to string
static const char* collTypeToString(ncclFunc_t collType) {
switch (collType) {
case ncclFuncBroadcast: return "broadcast";
case ncclFuncReduce: return "reduce";
case ncclFuncAllGather: return "allgather";
case ncclFuncReduceScatter: return "reducescatter";
case ncclFuncAllReduce: return "allreduce";
default: return "unknown";
}
}
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = NCCL_HW_NVLINK;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
nRanks;
int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
nNodes;
// Parse algorithm from string
static int parseAlgorithm(const char* str) {
if (strcmp(str, "tree") == 0) return NCCL_ALGO_TREE;
if (strcmp(str, "ring") == 0) return NCCL_ALGO_RING;
if (strcmp(str, "collnet_direct") == 0) return NCCL_ALGO_COLLNET_DIRECT;
if (strcmp(str, "collnet_chain") == 0) return NCCL_ALGO_COLLNET_CHAIN;
if (strcmp(str, "nvls") == 0) return NCCL_ALGO_NVLS;
if (strcmp(str, "nvls_tree") == 0) return NCCL_ALGO_NVLS_TREE;
if (strcmp(str, "pat") == 0) return NCCL_ALGO_PAT;
return NCCL_ALGO_RING; // default
}
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
// Convert algorithm to string
static const char* algorithmToString(int algorithm) {
switch (algorithm) {
case NCCL_ALGO_TREE: return "tree";
case NCCL_ALGO_RING: return "ring";
case NCCL_ALGO_COLLNET_DIRECT: return "collnet_direct";
case NCCL_ALGO_COLLNET_CHAIN: return "collnet_chain";
case NCCL_ALGO_NVLS: return "nvls";
case NCCL_ALGO_NVLS_TREE: return "nvls_tree";
case NCCL_ALGO_PAT: return "pat";
default: return "unknown";
}
}
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 1) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? 12.0 : 12.0; //graphs[a]->bwIntra : graphs[a]->bwInter
if (a == NCCL_ALGO_NVLS) bw = 0.0;
if (a == NCCL_ALGO_NVLS_TREE) bw = 0.0;
if (collnet == 1) bw = 0.0;
int nChannels = 28; //nNodes==1 && MI300
float busBw = nChannels * bw; //comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw
// Various model refinements
if (nNodes <= 2)
busBw *= tuning_model.bwRatio[0][a][p];
else
busBw *= tuning_model.bwRatio[1][a][p];
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && nNodes == 1) { busBw = busBw * 1.65; }
// Parse protocol from string
static int parseProtocol(const char* str) {
if (strcmp(str, "ll") == 0) return NCCL_PROTO_LL;
if (strcmp(str, "ll128") == 0) return NCCL_PROTO_LL128;
if (strcmp(str, "simple") == 0) return NCCL_PROTO_SIMPLE;
return NCCL_PROTO_SIMPLE; // default
}
// Convert bus BW to algorithm BW
if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
float ratio = 1.0f;
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
else ratio *= .5;
busBw *= ratio;
}
bandwidths[coll][a][p] = busBw;
latencies[coll][a][p] = baseLat[a][p];
float intraLat = tuning_model.hwLat[intraHw[a]][a][p];
float interLat = tuning_model.hwLat[NCCL_HW_NET][a][p];
// Convert protocol to string
static const char* protocolToString(int protocol) {
switch (protocol) {
case NCCL_PROTO_LL: return "ll";
case NCCL_PROTO_LL128: return "ll128";
case NCCL_PROTO_SIMPLE: return "simple";
default: return "unknown";
}
}
if (a == NCCL_ALGO_RING) {
float lat = tuning_model.hwLat[hw[a]][a][p];
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
latencies[coll][a][p] += lat;
} else {
// Inter-node rings still have to launch nsteps * net overhead.
float netOverhead = 0.0;
if (nNodes > 1) {
netOverhead = 1;
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
}
if (intraLat < netOverhead) intraLat = netOverhead;
latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
}
} else if (a == NCCL_ALGO_TREE) {
latencies[coll][a][p] +=
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
int minimum = 1;
if ((nRanks/nNodes-1) < 1) minimum = (nRanks/nNodes-1);
latencies[coll][a][p] +=
2 * (minimum * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency
} else if (a == NCCL_ALGO_COLLNET_CHAIN) {
latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
} else if (a == NCCL_ALGO_NVLS) {
if (nNodes > 1) latencies[coll][a][p] += tuning_model.hwLat[NCCL_HW_NET][a][p];
} else if (a == NCCL_ALGO_NVLS_TREE) {
latencies[coll][a][p] += 2*(nNodes-1)*tuning_model.hwLat[NCCL_HW_NET][a][p];
// Helper function to count valid configuration lines in file
static int countConfigLines(const char* filename) {
FILE* file = fopen(filename, "r");
if (!file) {
return 0;
}
char line[MAX_LINE_LENGTH];
int count = 0;
while (fgets(line, sizeof(line), file)) {
// Skip comments and empty lines
if (line[0] == '#' || line[0] == '\n') continue;
// Remove trailing newline
line[strcspn(line, "\n")] = 0;
// Check if line has content
if (strlen(line) > 0) {
count++;
}
}
fclose(file);
return count;
}
// Load configuration from file
static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
FILE* file = fopen(filename, "r");
if (!file) {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Config file %s not found, using defaults", filename);
}
return ncclSuccess; // Not finding config file is not an error
}
// First pass: count valid configuration lines
int configCount = countConfigLines(filename);
if (configCount == 0) {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: No valid configurations found in %s", filename);
}
fclose(file);
return ncclSuccess;
}
// Allocate memory for configurations based on actual count
ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig));
if (!ctx->configs) {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount);
}
fclose(file);
return ncclSystemError;
}
ctx->maxConfigs = configCount;
ctx->numConfigs = 0;
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount);
}
// Reset file pointer to beginning
fseek(file, 0, SEEK_SET);
char line[MAX_LINE_LENGTH];
while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) {
// Skip comments and empty lines
if (line[0] == '#' || line[0] == '\n') continue;
// Remove trailing newline
line[strcspn(line, "\n")] = 0;
// Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
char* token;
char* tokens[CONFIG_FIELDS_MAX];
int tokenCount = 0;
// Make a copy of the line for tokenizing
char lineCopy[MAX_LINE_LENGTH];
strncpy(lineCopy, line, sizeof(lineCopy));
lineCopy[sizeof(lineCopy) - 1] = '\0';
// Tokenize by comma
token = strtok(lineCopy, ",");
while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) {
// Trim whitespace
while (*token == ' ' || *token == '\t') token++;
char* end = token + strlen(token) - 1;
while (end > token && (*end == ' ' || *end == '\t')) {
*end = '\0';
end--;
}
tokens[tokenCount++] = token;
token = strtok(NULL, ",");
}
// Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10)
if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) {
TuningConfig* config = &ctx->configs[ctx->numConfigs];
config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]);
config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10);
config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10);
config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]);
config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]);
config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]);
config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]);
config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]);
// numPipeOps is optional (9th field, index 8)
if (tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) {
config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]);
} else {
config->numPipeOps = -1; // -1 means match any numPipeOps
}
// regBuff is optional (10th field, index 9)
if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) {
config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]);
} else {
config->regBuff = -1; // -1 means match any regBuff value
}
ctx->numConfigs++;
if (ctx->logFunction) {
if (config->numPipeOps == -1 && config->regBuff == -1) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any",
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
config->nChannels, config->nNodes, config->nRanks);
} else if (config->regBuff == -1) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any",
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
config->nChannels, config->nNodes, config->nRanks, config->numPipeOps);
} else if (config->numPipeOps == -1) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d",
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
config->nChannels, config->nNodes, config->nRanks, config->regBuff);
} else {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d",
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
}
}
}
}
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
// MNNVL: NVLS not yet supported
algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
algoEnable[NCCL_ALGO_NVLS] = 0;
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
int pEnable = protoEnable[p];
if (p == NCCL_PROTO_LL128) {
pEnable = 0;
}
if (pEnable == 0) bandwidths[c][a][p] = 0;
if (algoEnable[a] == 0) bandwidths[c][a][p] = 0;
fclose(file);
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename);
}
return ncclSuccess;
}
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
if (!ctx) return ncclSystemError;
ctx->configs = NULL; // Initialize to NULL
ctx->numConfigs = 0;
ctx->maxConfigs = 0; // Initialize to 0
ctx->nRanks = nRanks;
ctx->nNodes = nNodes;
ctx->logFunction = logFunction;
if (logFunction) {
logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
}
// Try to load config file from environment variable or default location
const char* configFile = getenv("NCCL_TUNER_CONFIG_FILE");
if (!configFile) {
configFile = "nccl_tuner.conf"; // default config file name
}
ncclResult_t result = loadConfig(ctx, configFile);
if (result != ncclSuccess) {
if (ctx->configs) {
free(ctx->configs); // Clean up allocated memory on error
}
free(ctx);
return result;
}
*context = ctx;
return ncclSuccess;
}
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels) {
float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
// Find algorithm / protocol.
*algorithm = -1;
*protocol = -1;
int nAlgos = NCCL_NUM_ALGORITHMS;
for (int a=0; a<nAlgos; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && collNetSupport != 1) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (p == NCCL_PROTO_LL128) continue;
float time;
ncclTopoGetAlgoTime_Tuner(collType, a, p, numPipeOps, &time, nBytes);
if (time >= 0 && time < minTime) {
*algorithm = a;
*protocol = p;
minTime = time;
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int regBuff, int* nChannels) {
TunerContext* ctx = (TunerContext*)context;
if (!ctx) return ncclInternalError;
// Default channels
*nChannels = 1;
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d",
collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs);
}
// Cast the collCostTable pointer to a 2D array to fix the segmentation fault
float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
// Look for matching configuration
for (int i = 0; i < ctx->numConfigs; i++) {
TuningConfig* config = &ctx->configs[i];
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d",
i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol),
config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
}
// Check if this config matches the current collective, size range, topology, pipeline ops, and regBuff
if (config->collType == collType &&
nBytes >= config->minBytes &&
nBytes <= config->maxBytes &&
(config->nNodes == -1 || config->nNodes == (int)ctx->nNodes) &&
(config->nRanks == -1 || config->nRanks == (int)ctx->nRanks) &&
(config->numPipeOps == -1 || config->numPipeOps == numPipeOps) &&
(config->regBuff == -1 || config->regBuff == regBuff)) {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Config matches. Applying algo=%s, proto=%s, channels=%d",
algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels);
}
// Check bounds
if (config->algorithm < numAlgo && config->protocol < numProto) {
if (table[config->algorithm][config->protocol] != NCCL_ALGO_PROTO_IGNORE) {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)",
algorithmToString(config->algorithm), protocolToString(config->protocol),
&table[config->algorithm][config->protocol], table[config->algorithm][config->protocol]);
}
table[config->algorithm][config->protocol] = 0.0; // Set low cost to prefer this configuration
// Only override channels if not set to -1 (keep default)
if (config->nChannels != -1) {
*nChannels = config->nChannels;
}
if (ctx->logFunction) {
if (config->nChannels == -1) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)",
collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
config->nNodes, config->nRanks);
} else {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)",
collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
config->nChannels, config->nNodes, config->nRanks);
}
}
return ncclSuccess;
} else {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE",
algorithmToString(config->algorithm), protocolToString(config->protocol));
}
}
} else {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)",
algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto);
}
}
} else {
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d",
config->collType == collType,
(nBytes >= config->minBytes && nBytes <= config->maxBytes),
(config->nNodes == -1 || config->nNodes == (int)ctx->nNodes),
(config->nRanks == -1 || config->nRanks == (int)ctx->nRanks),
(config->numPipeOps == -1 || config->numPipeOps == numPipeOps),
(config->regBuff == -1 || config->regBuff == regBuff));
}
}
}
// If no specific config found, apply default behavior
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: No matching config found");
}
return ncclSuccess;
}
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
__hidden ncclResult_t pluginDestroy(void* context) {
if (context) {
TunerContext* ctx = (TunerContext*)context;
if (ctx->configs) {
free(ctx->configs); // Free dynamically allocated configs array
}
free(context);
}
return ncclSuccess;
}
#define PLUGIN_NAME "Example"
+106
Voir le fichier
@@ -0,0 +1,106 @@
# NCCL Tuner Configuration Scripts
This directory contains scripts for optimizing NCCL tuner configurations based on performance data.
## optimize_config.py
A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations.
### Usage
```bash
python scripts/optimize_config.py [options] <input_csv_file>
```
### Options
- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`)
- `-m, --metric METRIC`: Optimization metric (`cost_metric`, `bandwidth_gbps`, `latency_us`)
- `--no-header`: Don't add header comments to output file
- `--dry-run`: Print configurations without writing to file
### CSV Input Format
The input CSV file should have the following columns:
```csv
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
```
**Required columns:**
- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.)
- `size_bytes`: Message size in bytes
- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.)
- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`)
- `channels`: Number of channels (or `-1` for default)
- `nodes`: Number of nodes (or `-1` for any)
- `ranks`: Number of ranks (or `-1` for any)
- `pipeOps`: Number of pipeline operations (or `-1` for any)
- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any)
**Optional metrics (must have at least one present):**
- `bandwidth_gbps`: Bandwidth in GB/s (higher is better)
- `latency_us`: Latency in microseconds (lower is better)
### Examples
**Basic usage with cost optimization:**
```bash
python scripts/optimize_config.py sample_performance_data.csv
```
**Optimize for bandwidth and write to custom file:**
```bash
python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv
```
**Preview configurations without writing:**
```bash
python scripts/optimize_config.py --dry-run performance_data.csv
```
### How It Works
1. **Data Loading**: Reads CSV performance data and validates format
2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters
3. **Size Ranges**: Automatically bins data into size ranges for optimization
4. **Optimization**: Finds the best performing configuration for each group/size combination
5. **Output**: Generates NCCL tuner config format and appends to specified file
### Default Size Ranges
The script uses these default size ranges (in bytes):
- Small: 0 - 1,024
- Medium: 1,025 - 65,536
- Large: 65,537 - 1,048,576
- XLarge: 1,048,577 - 16,777,216
- XXLarge: 16,777,217 - 4,294,967,295
### Sample Data
See `sample_performance_data.csv` for an example of the expected input format.
### Integration with NCCL
The generated configuration file can be used directly with the NCCL tuner plugin:
```bash
export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf
export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner.so
mpirun -np 8 your_nccl_application
```
### Performance Data Collection
To collect performance data for optimization, you can:
1. **Use NCCL benchmarks** with different algorithm/protocol combinations
2. **Profile your applications** with various tuner settings
3. **Run systematic sweeps** across parameter combinations
4. **Use NCCL debug output** to collect timing information
The key is to have comprehensive data covering:
- Different message sizes (small to large)
- Various topologies (single node, multi-node)
- All relevant algorithm/protocol combinations
- Different channel counts and pipeline configurations
+430
Voir le fichier
@@ -0,0 +1,430 @@
#!/usr/bin/env python3
"""
NCCL Tuner Configuration Optimizer
Reads a CSV file containing performance data across different tuning parameters
and generates optimal NCCL tuner configurations based on the best performing
combinations.
By default, creates growing size ranges that interpolate between the actual data sizes
for each unique dimension (node count, rank count combination). This ensures that
different cluster configurations get their own optimized size boundaries, as
performance characteristics often vary significantly between topologies.
Each dimension gets its own set of ranges starting from 0 and extending to the maximum
size for that dimension, with boundaries at midpoints between consecutive data sizes.
CSV Input Format:
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us
Output Format (NCCL Tuner Config):
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
Usage Examples:
# Auto-create dimension-specific interpolated ranges (default)
python3 optimize_config.py data.csv
# Use custom size ranges (applied to all topologies)
python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576"
# Use hardcoded default ranges (applied to all topologies)
python3 optimize_config.py data.csv --no-auto-ranges
"""
import csv
import argparse
import sys
import os
from collections import defaultdict
from typing import Dict, List, Tuple, Any
class PerformanceData:
def __init__(self, row: Dict[str, str]):
self.collective = row['collective']
self.size_bytes = int(row['size_bytes'])
self.algorithm = row['algorithm']
self.protocol = row['protocol']
self.channels = int(row['channels']) if row['channels'] != '-1' else -1
self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1
self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1
self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1
self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1
# Performance metrics
self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0)) # Higher is better
self.latency_us = float(row.get('latency_us', 0)) # Lower is better
def get_config_key(self) -> Tuple:
"""Generate a key for grouping similar configurations"""
return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff)
def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]:
"""Find which size range this data point belongs to for its dimension"""
topology_key = (self.nodes, self.ranks)
# Get size ranges for this dimension, or fall back to default
if topology_key in topology_size_ranges:
size_ranges = topology_size_ranges[topology_key]
elif (-1, -1) in topology_size_ranges:
size_ranges = topology_size_ranges[(-1, -1)]
else:
# Fallback to first available dimension ranges
size_ranges = next(iter(topology_size_ranges.values()))
for min_size, max_size in size_ranges:
if min_size <= self.size_bytes <= max_size:
return (min_size, max_size)
# If no range found, create a single-point range
return (self.size_bytes, self.size_bytes)
class ConfigOptimizer:
def __init__(self, optimization_metric: str = 'latency_us'):
self.optimization_metric = optimization_metric
# Default size ranges - will be overridden by auto-detection
self.size_ranges = [
(0, 1024),
(1025, 64*1024),
(64*1024+1, 1024*1024),
(1024*1024+1, 16*1024*1024),
(16*1024*1024+1, 4*1024*1024*1024-1)
]
self.auto_size_ranges = True
def set_size_ranges(self, ranges: List[Tuple[int, int]]):
"""Set custom size ranges for optimization"""
self.size_ranges = ranges
self.auto_size_ranges = False
def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
"""Create growing size ranges for each unique (nodes, ranks) dimension"""
if not data:
return {(-1, -1): self.size_ranges}
# Group data by dimension (nodes, ranks)
topology_data = defaultdict(list)
for item in data:
topology_key = (item.nodes, item.ranks)
topology_data[topology_key].append(item)
topology_ranges = {}
for topology_key, items in topology_data.items():
nodes, ranks = topology_key
# Extract unique sizes for this dimension and sort them
unique_sizes = sorted(set(item.size_bytes for item in items))
if len(unique_sizes) <= 1:
# Only one size, create a single range from 0 to that size
size = unique_sizes[0] if unique_sizes else 0
ranges = [(0, size)]
else:
# Create growing ranges that interpolate between data points
ranges = []
for i, size in enumerate(unique_sizes):
if i == 0:
# First range: 0 to midpoint between first and second size
if len(unique_sizes) > 1:
next_size = unique_sizes[i + 1]
max_size = (size + next_size) // 2
else:
max_size = size
min_size = 0
elif i == len(unique_sizes) - 1:
# Last range: previous max + 1 to current size (and beyond)
min_size = ranges[-1][1] + 1
max_size = size
else:
# Intermediate ranges: previous max + 1 to midpoint with next size
min_size = ranges[-1][1] + 1
next_size = unique_sizes[i + 1]
max_size = (size + next_size) // 2
ranges.append((min_size, max_size))
topology_ranges[topology_key] = ranges
print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:")
for i, (min_size, max_size) in enumerate(ranges):
# Count data points that fall in this range for this dimension
count = sum(1 for item in items if min_size <= item.size_bytes <= max_size)
actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size))
if actual_sizes:
size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3])
if len(actual_sizes) > 3:
size_list += f", ... (+{len(actual_sizes)-3} more)"
print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})")
return topology_ranges
def load_data(self, csv_file: str) -> List[PerformanceData]:
"""Load performance data from CSV file"""
data = []
try:
with open(csv_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
try:
data.append(PerformanceData(row))
except (ValueError, KeyError) as e:
print(f"Warning: Skipping invalid row: {row} - {e}")
except FileNotFoundError:
print(f"Error: File {csv_file} not found")
sys.exit(1)
except Exception as e:
print(f"Error reading {csv_file}: {e}")
sys.exit(1)
print(f"Loaded {len(data)} performance data points")
# Auto-determine size ranges if enabled
if self.auto_size_ranges and data:
self.topology_size_ranges = self.auto_determine_size_ranges(data)
else:
# Use default ranges for all topologies
self.topology_size_ranges = {(-1, -1): self.size_ranges}
return data
def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool:
"""Determine if new_data is better than current_best"""
if self.optimization_metric == 'bandwidth_gbps':
return new_data.bandwidth_gbps > current_best.bandwidth_gbps
elif self.optimization_metric == 'latency_us':
return new_data.latency_us < current_best.latency_us
else:
# Default to latency
return new_data.latency_us < current_best.latency_us
def optimize_configurations(self, data: List[PerformanceData]) -> List[str]:
"""Find optimal configurations and return as NCCL config strings"""
# Group data by configuration key and size range
grouped_data = defaultdict(lambda: defaultdict(list))
for item in data:
config_key = item.get_config_key()
size_range = item.get_size_range_key(self.topology_size_ranges)
grouped_data[config_key][size_range].append(item)
# Store optimal configurations before combining ranges
optimal_configs = []
for config_key, size_ranges_dict in grouped_data.items():
collective, nodes, ranks, pipeOps, regBuff = config_key
for (min_size, max_size), items in size_ranges_dict.items():
if not items:
continue
# Find the best performing configuration for this size range
best_item = items[0]
for item in items[1:]:
if self.is_better(item, best_item):
best_item = item
# Store the optimal configuration with its range
optimal_configs.append({
'collective': collective,
'min_size': min_size,
'max_size': max_size,
'algorithm': best_item.algorithm,
'protocol': best_item.protocol,
'channels': best_item.channels,
'nodes': best_item.nodes,
'ranks': best_item.ranks,
'pipeOps': best_item.pipeOps,
'regBuff': best_item.regBuff,
'metric_value': getattr(best_item, self.optimization_metric)
})
# Combine sequential ranges with identical tunings
combined_configs = self.combine_sequential_ranges(optimal_configs)
# Generate config strings
configs = []
for config in combined_configs:
config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}"
configs.append(config_str)
print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: "
f"{config['algorithm']}/{config['protocol']} channels={config['channels']} "
f"({self.optimization_metric}={config['metric_value']:.3f})")
return configs
def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]:
"""Combine sequential ranges that have identical tuning parameters"""
if not configs:
return configs
# Group by collective and topology (nodes, ranks)
topology_groups = defaultdict(list)
for config in configs:
topology_key = (config['collective'], config['nodes'], config['ranks'],
config['pipeOps'], config['regBuff'])
topology_groups[topology_key].append(config)
combined_configs = []
for topology_key, topology_configs in topology_groups.items():
# Sort by min_size to ensure proper ordering
topology_configs.sort(key=lambda x: x['min_size'])
# Group by tuning parameters (algorithm, protocol, channels)
tuning_groups = defaultdict(list)
for config in topology_configs:
tuning_key = (config['algorithm'], config['protocol'], config['channels'])
tuning_groups[tuning_key].append(config)
# For each tuning group, combine sequential ranges
for tuning_key, tuning_configs in tuning_groups.items():
if not tuning_configs:
continue
# Sort by min_size
tuning_configs.sort(key=lambda x: x['min_size'])
# Combine sequential ranges
current_config = tuning_configs[0].copy()
for next_config in tuning_configs[1:]:
# Check if ranges are adjacent or overlapping
if current_config['max_size'] + 1 >= next_config['min_size']:
# Extend the current range
current_config['max_size'] = max(current_config['max_size'], next_config['max_size'])
# Update metric value to the better one
if self.optimization_metric == 'bandwidth_gbps':
if next_config['metric_value'] > current_config['metric_value']:
current_config['metric_value'] = next_config['metric_value']
else: # latency_us or default
if next_config['metric_value'] < current_config['metric_value']:
current_config['metric_value'] = next_config['metric_value']
else:
# Gap between ranges, save current and start new one
combined_configs.append(current_config)
current_config = next_config.copy()
# Add the last configuration
combined_configs.append(current_config)
# Sort final configs by collective, nodes, ranks, then min_size
combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size']))
original_count = len(configs)
combined_count = len(combined_configs)
if combined_count < original_count:
print(f"Combined {original_count} ranges into {combined_count} ranges "
f"(reduced by {original_count - combined_count})")
return combined_configs
def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True):
"""Append optimized configurations to NCCL tuner config file"""
try:
# Create directory if it doesn't exist
config_dir = os.path.dirname(config_file)
if config_dir and not os.path.exists(config_dir):
os.makedirs(config_dir)
print(f"Created directory: {config_dir}")
# Check if file exists and has content
file_exists = os.path.exists(config_file)
add_separator = False
if file_exists:
with open(config_file, 'r') as f:
content = f.read().strip()
add_separator = len(content) > 0
print(f"Appending to existing file: {config_file}")
else:
print(f"Creating new file: {config_file}")
with open(config_file, 'a') as f:
if add_separator:
f.write("\n\n")
if add_header:
f.write(f"# Optimized configurations generated by optimize_config.py\n")
f.write(f"# Optimization metric: {self.optimization_metric}\n")
f.write(f"# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n")
for config in configs:
f.write(f"{config}\n")
if file_exists:
print(f"Appended {len(configs)} optimized configurations to {config_file}")
else:
print(f"Created {config_file} with {len(configs)} optimized configurations")
except PermissionError:
print(f"Error: Permission denied writing to {config_file}")
print("Try running with appropriate permissions or choose a different output location")
sys.exit(1)
except OSError as e:
print(f"Error: Cannot create/write to {config_file}: {e}")
print("Check that the path is valid and you have write permissions")
sys.exit(1)
except Exception as e:
print(f"Unexpected error writing to {config_file}: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data")
parser.add_argument("csv_file", help="Input CSV file with performance data")
parser.add_argument("-o", "--output", default="nccl_tuner.conf",
help="Output NCCL tuner config file (default: nccl_tuner.conf)")
parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'],
default='latency_us', help="Optimization metric (default: latency_us)")
parser.add_argument("--no-header", action="store_true",
help="Don't add header comments to output file")
parser.add_argument("--dry-run", action="store_true",
help="Print configurations without writing to file")
parser.add_argument("--no-auto-ranges", action="store_true",
help="Disable automatic size range determination (use default ranges)")
parser.add_argument("--size-ranges", type=str,
help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'")
args = parser.parse_args()
optimizer = ConfigOptimizer(args.metric)
# Handle size range configuration
if args.size_ranges:
# Parse custom size ranges
try:
ranges = []
for range_str in args.size_ranges.split(','):
min_size, max_size = map(int, range_str.split('-'))
ranges.append((min_size, max_size))
optimizer.set_size_ranges(ranges)
print(f"Using custom size ranges: {ranges}")
except ValueError:
print("Error: Invalid size ranges format. Use 'min1-max1,min2-max2,...'")
sys.exit(1)
elif args.no_auto_ranges:
# Disable auto-ranging
optimizer.auto_size_ranges = False
print("Using default hardcoded size ranges")
else:
# Auto-ranging is enabled by default - creates one bucket per unique size
optimizer.auto_size_ranges = True
print("Auto-ranging enabled: will create one bucket per unique size in data")
# Load and optimize data
data = optimizer.load_data(args.csv_file)
if not data:
print("No valid data found in CSV file")
sys.exit(1)
configs = optimizer.optimize_configurations(data)
if args.dry_run:
print("\nGenerated configurations:")
for config in configs:
print(config)
else:
optimizer.append_to_config_file(configs, args.output, not args.no_header)
if __name__ == "__main__":
main()
+24
Voir le fichier
@@ -0,0 +1,24 @@
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5
allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8
allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2
allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1
allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3
allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4
allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2
allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1
allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7
allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1
broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2
broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1
broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5
broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2
broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1
broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3
allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4
allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6
allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1
allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3
allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2
allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1
allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3
1 collective size_bytes algorithm protocol channels nodes ranks pipeOps regBuff cost_metric bandwidth_gbps latency_us
2 allreduce 1024 tree simple 2 1 8 -1 -1 0.15 45.2 12.5
3 allreduce 1024 ring simple 4 1 8 -1 -1 0.12 52.1 10.8
4 allreduce 1024 tree ll 2 1 8 -1 -1 0.18 41.3 15.2
5 allreduce 1024 ring ll 4 1 8 -1 -1 0.14 48.7 12.1
6 allreduce 32768 tree simple 2 1 8 -1 -1 0.25 156.8 25.3
7 allreduce 32768 ring simple 4 1 8 -1 -1 0.18 189.2 18.4
8 allreduce 32768 ring ll128 8 1 8 -1 -1 0.16 201.5 16.2
9 allreduce 1048576 ring simple 4 1 8 -1 -1 0.45 425.6 45.1
10 allreduce 1048576 ring ll128 8 1 8 -1 -1 0.38 482.3 38.7
11 allreduce 1048576 nvls simple 16 1 8 -1 -1 0.32 551.2 32.1
12 broadcast 1024 tree simple 2 1 8 -1 -1 0.08 89.4 8.2
13 broadcast 1024 ring simple 4 1 8 -1 -1 0.12 71.3 12.1
14 broadcast 32768 tree simple 2 1 8 -1 -1 0.18 234.7 18.5
15 broadcast 32768 ring ll128 4 1 8 -1 -1 0.15 267.8 15.2
16 broadcast 1048576 ring simple 4 1 8 -1 -1 0.35 612.4 35.1
17 broadcast 1048576 ring ll128 8 1 8 -1 -1 0.28 702.1 28.3
18 allreduce 1024 tree simple 2 2 16 -1 -1 0.22 38.1 22.4
19 allreduce 1024 ring simple 4 2 16 -1 -1 0.19 42.7 19.6
20 allreduce 32768 ring simple 4 2 16 -1 -1 0.28 145.2 28.1
21 allreduce 32768 ring ll128 8 2 16 -1 -1 0.24 167.8 24.3
22 allreduce 1048576 ring simple 4 2 16 -1 -1 0.58 387.5 58.2
23 allreduce 1048576 ring ll128 8 2 16 -1 -1 0.48 456.9 48.1
24 allreduce 1048576 nvls simple 16 2 16 -1 -1 0.42 512.6 42.3
+30
Voir le fichier
@@ -0,0 +1,30 @@
#
# Makefile for NCCL Tuner Plugin Unit Tests
#
CC := gcc
CFLAGS := -Wall -Wextra -g -std=c99 -fPIC
INC := -I. -I../nccl
TARGET := test_plugin
SOURCES := test_plugin.c
# Default target
all: $(TARGET)
# Build the test executable
$(TARGET): $(SOURCES)
$(CC) $(CFLAGS) $(INC) -o $(TARGET) $(SOURCES)
# Run the tests
test: $(TARGET)
./$(TARGET) $(TEST_CASE)
# Run tests with verbose output
test-verbose: $(TARGET)
NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE)
# Clean build artifacts
clean:
rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf
.PHONY: all test test-verbose clean
+205
Voir le fichier
@@ -0,0 +1,205 @@
# NCCL Tuner Plugin Unit Tests
This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates.
## Test Structure
```
test/
├── test_plugin.c # Main unit test file
├── Makefile # Build system for tests
└── README.md # This file
```
## Building and Running Tests
### Quick Start
```bash
# Build and run all tests
make test
# Or step by step
make # Build test executable
./test_plugin # Run tests
```
### Advanced Testing
```bash
# Run with memory leak detection (requires valgrind)
make test-memory
# Run with verbose logging
make test-verbose
# Generate code coverage report (requires gcov)
make coverage
# Create sample test configuration files
make test-configs
```
## Test Coverage
The unit tests cover the following functionality:
### 1. **Plugin Initialization (`test_plugin_init`)**
- Tests successful plugin initialization
- Verifies context allocation
- Tests cleanup on destroy
### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)**
- Valid CSV format parsing
- Comment and empty line handling
- Invalid format graceful handling
- Environment variable configuration
### 3. **Collective Type Matching (`test_collective_matching`)**
- Correct matching of allreduce, broadcast, etc.
- Algorithm/protocol selection
- Channel configuration
### 4. **Size Range Matching (`test_size_matching`)**
- Small, medium, large message size handling
- Proper range boundary checking
- Multiple size-based configurations
### 5. **Topology Matching (`test_topology_matching`)**
- Single-node vs multi-node configurations
- Exact nNodes/nRanks matching
- Wildcard matching (-1 values)
### 6. **Default Channels (`test_default_channels`)**
- Proper handling of -1 channel specification
- Preservation of NCCL default behavior
### 7. **Registered Buffer Matching (`test_regbuff_matching`)**
- Configurations based on regBuff parameter
- Registered vs non-registered buffer handling
- Backward compatibility with configs missing regBuff
### 8. **Pipeline Operations Matching (`test_pipeops_matching`)**
- Configurations based on numPipeOps parameter
- Single vs multiple pipeline operation handling
- Backward compatibility with configs missing numPipeOps
### 9. **Fallback Behavior (`test_no_match_fallback`)**
- Default behavior when no config matches
- Ring/Simple algorithm fallback
## Test Output
Successful test run:
```
Running NCCL Tuner Plugin Unit Tests
=====================================
PASS: test_plugin_init
PASS: test_config_parsing_valid
PASS: test_config_parsing_invalid
PASS: test_collective_matching
PASS: test_size_matching
PASS: test_topology_matching
PASS: test_default_channels
PASS: test_regbuff_matching
PASS: test_pipeops_matching
PASS: test_no_match_fallback
=====================================
Test Results: 9/9 tests passed
All tests PASSED!
```
Failed test example:
```
FAIL: test_collective_matching - Tree/Simple should have low cost
Test Results: 8/9 tests passed
Some tests FAILED!
```
## Mock NCCL Implementation
The tests use the actual NCCL header files from the `../nccl/` directory:
- `tuner.h` - Complete NCCL tuner interface and type definitions
- `common.h` - Common NCCL types and logging functions
- `err.h` - NCCL error codes
This allows testing with the real NCCL interface definitions while still being able to run tests without the full NCCL library installation.
## Integration with CI/CD
```bash
# Install tests for CI/CD pipeline
make install-test
# Run as part of automated testing
make test && echo "Tests passed" || echo "Tests failed"
```
## Memory Testing
The tests can be run with valgrind for memory leak detection:
```bash
make test-memory
```
This will detect:
- Memory leaks
- Invalid memory access
- Use of uninitialized memory
## Code Coverage
Generate code coverage reports to ensure comprehensive testing:
```bash
make coverage
# Creates test_plugin.c.gcov with line-by-line coverage
```
## Adding New Tests
To add a new test:
1. Create a new test function in `test_plugin.c`:
```c
int test_new_feature() {
// Test setup
TEST_ASSERT(condition, "description");
// Test cleanup
TEST_PASS();
}
```
2. Add the test to the main function:
```c
total++; passed += test_new_feature();
```
3. Rebuild and run:
```bash
make test
```
## Debugging Tests
For debugging failed tests:
```bash
# Compile with debug symbols
make CFLAGS="-g -O0 -DDEBUG"
# Run with gdb
gdb ./test_plugin
```
## Cleaning Up
```bash
# Remove all build artifacts and temporary files
make clean
```
This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases.
+856
Voir le fichier
@@ -0,0 +1,856 @@
/*************************************************************************
* Unit tests for NCCL Tuner Plugin
************************************************************************/
#define _GNU_SOURCE // Enable setenv/unsetenv and other GNU extensions
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/stat.h>
#include <stdarg.h>
// Include NCCL tuner header (which includes common.h and err.h)
#include "tuner.h"
// Include plugin source for testing
#include "../plugin.c"
// Test framework macros
#define TEST_ASSERT(condition, message) \
do { \
if (!(condition)) { \
printf("FAIL: %s - %s\n", __func__, message); \
return 0; \
} \
} while(0)
#define TEST_PASS() \
do { \
printf("PASS: %s\n", __func__); \
return 1; \
} while(0)
// Global test state
static int test_log_count = 0;
// Mock logger function
void mock_logger(ncclDebugLogLevel level, unsigned long flags,
const char* file, int line, const char* fmt, ...) {
(void)flags; // Suppress unused parameter warning
test_log_count++;
// Check if we should print based on NCCL_DEBUG level
const char* debug_level = getenv("NCCL_DEBUG");
int should_print = 0;
if (debug_level) {
if (strcmp(debug_level, "TRACE") == 0) {
should_print = 1; // Print everything
} else if (strcmp(debug_level, "INFO") == 0 && level <= NCCL_LOG_INFO) {
should_print = 1; // Print INFO and below
} else if (strcmp(debug_level, "WARN") == 0 && level <= NCCL_LOG_WARN) {
should_print = 1; // Print WARN and below
}
}
if (!should_print) return;
// Convert log level to string
const char* level_str;
switch(level) {
case NCCL_LOG_NONE: level_str = "NONE"; break;
case NCCL_LOG_VERSION: level_str = "VERSION"; break;
case NCCL_LOG_WARN: level_str = "WARN"; break;
case NCCL_LOG_INFO: level_str = "INFO"; break;
case NCCL_LOG_ABORT: level_str = "ABORT"; break;
case NCCL_LOG_TRACE: level_str = "TRACE"; break;
default: level_str = "UNKNOWN"; break;
}
// Print log header
printf("[TUNER:%s:%s:%d] ", level_str, file, line);
// Print formatted message
va_list args;
va_start(args, fmt);
vprintf(fmt, args);
va_end(args);
printf("\n");
}
// Helper function to create test config file
void create_test_config(const char* filename, const char* content) {
FILE* f = fopen(filename, "w");
if (f) {
fprintf(f, "%s", content);
fclose(f);
}
}
// Test 1: Plugin initialization
int test_plugin_init() {
void* context = NULL;
// Test successful initialization
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
TEST_ASSERT(context != NULL, "Context should be allocated");
// Clean up
pluginDestroy(context);
TEST_PASS();
}
// Test 2: Configuration file parsing - valid CSV
int test_config_parsing_valid() {
const char* test_config =
"# Test configuration\n"
"allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n"
"broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n"
"# Comment line\n"
"\n" // Empty line
"reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n";
create_test_config("test_valid.conf", test_config);
// Set environment variable to use our test config
setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);
void* context = NULL;
ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");
// Clean up
pluginDestroy(context);
unlink("test_valid.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 3: Configuration file parsing - invalid CSV
int test_config_parsing_invalid() {
const char* test_config =
"allreduce,0,65536,tree,simple,2,1 # Missing nRanks and other fields\n"
"invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n"
"broadcast,abc,def,ring,simple,1,1,1,-1,-1\n"; // Invalid numbers
create_test_config("test_invalid.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);
void* context = NULL;
ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
// Should still succeed but with no valid configs loaded
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");
// Clean up
pluginDestroy(context);
unlink("test_invalid.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 4: Collective type matching
int test_collective_matching() {
const char* test_config =
"allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n"
"broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n";
create_test_config("test_match.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
// Create mock cost table
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0; // Default high cost
}
}
int nChannels;
// Test allreduce matching (should match first config)
ncclResult_t result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost");
TEST_ASSERT(nChannels == 8, "Should set 8 channels");
// Test broadcast matching (should match second config)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0; // Reset costs
}
}
result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost");
TEST_ASSERT(nChannels == 4, "Should set 4 channels");
// Clean up
pluginDestroy(context);
unlink("test_match.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 5: Size range matching
int test_size_matching() {
const char* test_config =
"allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n"
"allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n"
"allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n";
create_test_config("test_size.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
int nChannels = 1;
pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost");
TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels");
// Test medium message (should match second config)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost");
TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels");
// Test large message (should match third config)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost");
TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");
// Clean up
pluginDestroy(context);
unlink("test_size.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 6: Topology matching
int test_topology_matching() {
const char* test_config =
"allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" // Single node only
"allreduce,0,65536,ring,simple,4,4,32,-1,-1\n" // 4 nodes, 32 ranks exactly
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any topology
create_test_config("test_topo.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1);
// Test with single node setup
void* context1 = NULL;
pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
int nChannels;
pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
pluginDestroy(context1);
// Test with 4 nodes, 32 ranks setup
void* context2 = NULL;
pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config");
TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels");
// Clean up
unlink("test_topo.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 7: Default channels behavior (-1)
int test_default_channels() {
const char* test_config =
"allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n"; // Use default channels
create_test_config("test_default.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
int nChannels = 99; // Set to known value
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol");
TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");
// Clean up
pluginDestroy(context);
unlink("test_default.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 8: regBuff matching
int test_regbuff_matching() {
const char* test_config =
"allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n" // Registered buffers only
"allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n" // Non-registered buffers only
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any buffer type (backward compatible)
create_test_config("test_regbuff.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
}
int nChannels;
// Test registered buffer (should match first config)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
1, &nChannels); // regBuff = 1 (registered)
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost");
TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels");
// Test non-registered buffer (should match second config)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels); // regBuff = 0 (non-registered)
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost");
TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels");
// Test backward compatibility - config without regBuff should match any regBuff value
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
// First try with regBuff=2 (unusual value, should match third config)
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
2, &nChannels); // regBuff = 2 (only third config should match)
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost");
TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");
// Clean up
pluginDestroy(context);
unlink("test_regbuff.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 9: numPipeOps matching
int test_pipeops_matching() {
const char* test_config =
"allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n" // Single pipeline op
"allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n" // Multiple pipeline ops
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any pipeline ops (backward compatible)
create_test_config("test_pipeops.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
}
int nChannels;
// Test single pipeline op (should match first config)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost");
TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels");
// Test multiple pipeline ops (should match second config)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 4,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost");
TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels");
// Test different number of pipeline ops (should match third config - backward compatible)
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 2,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any pipeOps: Ring/LL128 should have low cost");
TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");
// Clean up
pluginDestroy(context);
unlink("test_pipeops.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 10: No matching configuration (fallback behavior)
int test_no_match_fallback() {
const char* test_config =
"broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n"; // Only broadcast config
create_test_config("test_fallback.conf", test_config);
setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
int nChannels;
// Try allreduce (should not match, use fallback)
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should use pass through unmodified");
TEST_ASSERT(nChannels == 1, "Should use default channels");
// Clean up
pluginDestroy(context);
unlink("test_fallback.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 11: Large configuration files (testing dynamic allocation)
int test_large_config() {
const char* large_config_file = "test_large.conf";
// Create a large configuration file with many entries
// This tests the dynamic allocation functionality
FILE* f = fopen(large_config_file, "w");
TEST_ASSERT(f != NULL, "Should be able to create large config file");
// Write header comment
fprintf(f, "# Large configuration file for testing dynamic allocation\n");
fprintf(f, "# This file contains many configurations to test memory allocation\n");
// Generate a large number of configurations (much more than the old MAX_CONFIGS=100)
const int num_configs = 500; // 5x the old static limit
const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"};
const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"};
const char* protocols[] = {"simple", "ll", "ll128"};
for (int i = 0; i < num_configs; i++) {
// Vary the configurations to create realistic test data
const char* coll = collectives[i % 5];
const char* algo = algorithms[i % 4];
const char* proto = protocols[i % 3];
size_t min_bytes = (i * 1024) % 1048576; // Vary from 0 to 1MB
size_t max_bytes = min_bytes + 65536; // 64KB range
int channels = (i % 8) + 1; // 1-8 channels
int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes
int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks
int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps
int regBuff = (i % 3) == 0 ? -1 : (i % 2); // Mix of -1, 0, 1
fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n",
coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff);
}
fclose(f);
// Set environment to use our large config file
setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1);
// Initialize plugin with large config
void* context = NULL;
ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
TEST_ASSERT(context != NULL, "Context should be allocated");
// Verify that configurations were loaded
TunerContext* ctx = (TunerContext*)context;
TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file");
TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size");
TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated");
// Test that we can access configurations throughout the array
// (This would have failed with the old static MAX_CONFIGS=100 limit)
for (int i = 0; i < ctx->numConfigs; i++) {
TuningConfig* config = &ctx->configs[i];
// Basic sanity checks on the loaded configurations
TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce,
"Collective type should be valid");
TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes");
TEST_ASSERT(config->nChannels > 0, "nChannels should be positive");
}
// Test specific configuration access at various indices
// Index 0 (first config)
TuningConfig* first_config = &ctx->configs[0];
TEST_ASSERT(first_config != NULL, "First config should be accessible");
// Index in middle
TuningConfig* mid_config = &ctx->configs[num_configs / 2];
TEST_ASSERT(mid_config != NULL, "Middle config should be accessible");
// Index near end (this would have crashed with static array of 100)
TuningConfig* late_config = &ctx->configs[num_configs - 1];
TEST_ASSERT(late_config != NULL, "Last config should be accessible");
// Test memory allocation size - verify we didn't over-allocate
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs);
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"Memory allocated for %d configurations (%zu bytes total)",
ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig));
// Test that the plugin can still find matching configurations from the large set
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0; // Default high cost
}
}
int nChannels;
// Try to find a matching configuration - should work with large config set
result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");
// Clean up
pluginDestroy(context);
unlink(large_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 12: Very large configuration stress test
int test_very_large_config_stress() {
const char* stress_config_file = "test_stress.conf";
// Create an even larger configuration file to stress test the implementation
FILE* f = fopen(stress_config_file, "w");
TEST_ASSERT(f != NULL, "Should be able to create stress test config file");
fprintf(f, "# Stress test configuration with very large number of entries\n");
// Generate an extremely large number of configurations
const int stress_configs = 2000; // 20x the old static limit
for (int i = 0; i < stress_configs; i++) {
// Create varied but valid configurations
fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n",
i * 512, (i * 512) + 1024);
}
fclose(f);
setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1);
// Test initialization with stress config
void* context = NULL;
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
TunerContext* ctx = (TunerContext*)context;
TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations");
TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated");
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"Stress test - loaded %d configurations successfully", stress_configs);
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
"Memory usage: %zu bytes for configuration array",
stress_configs * sizeof(TuningConfig));
// Verify we can access configurations throughout the entire range
for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config
TuningConfig* config = &ctx->configs[i];
TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type");
TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes");
}
// Clean up
pluginDestroy(context);
unlink(stress_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test 13: Edge case - empty config file
int test_empty_config() {
const char* empty_config_file = "test_empty.conf";
// Create empty config file (only comments)
create_test_config(empty_config_file,
"# Empty configuration file\n"
"# No actual configurations\n"
"\n"
"\n");
setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
void* context = NULL;
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
TunerContext* ctx = (TunerContext*)context;
TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations");
TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations");
TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config");
// Test that plugin still works with no configurations (fallback behavior)
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
cost_table_ptr[i] = cost_table[i];
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
cost_table[i][j] = 1.0;
}
}
int nChannels;
result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
0, &nChannels);
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
// Clean up
pluginDestroy(context);
unlink(empty_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test runner function pointer type
typedef int (*TestFunction)(void);
// Test registry
typedef struct {
const char* name;
TestFunction func;
const char* description;
} TestCase;
// All available tests
TestCase test_cases[] = {
{"init", test_plugin_init, "Plugin initialization"},
{"config-valid", test_config_parsing_valid, "Valid configuration parsing"},
{"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"},
{"collective", test_collective_matching, "Collective type matching"},
{"size", test_size_matching, "Size range matching"},
{"topology", test_topology_matching, "Topology matching"},
{"channels", test_default_channels, "Default channels behavior"},
{"regbuff", test_regbuff_matching, "Registered buffer matching"},
{"pipeops", test_pipeops_matching, "Pipeline operations matching"},
{"fallback", test_no_match_fallback, "Fallback behavior"},
{"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
{"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
{"empty-config", test_empty_config, "Empty configuration file handling"},
{NULL, NULL, NULL} // End marker
};
// Show help/usage information
void show_help(const char* program_name) {
printf("Usage: %s [test_name ...]\n\n", program_name);
printf("Available tests:\n");
for (int i = 0; test_cases[i].name != NULL; i++) {
printf(" %-15s - %s\n", test_cases[i].name, test_cases[i].description);
}
printf("\nExamples:\n");
printf(" %s # Run all tests\n", program_name);
printf(" %s init # Run only initialization test\n", program_name);
printf(" %s init collective # Run initialization and collective tests\n", program_name);
printf(" %s --help # Show this help\n", program_name);
}
// Find test by name
TestFunction find_test(const char* name) {
for (int i = 0; test_cases[i].name != NULL; i++) {
if (strcmp(test_cases[i].name, name) == 0) {
return test_cases[i].func;
}
}
return NULL;
}
// Main test runner
int main(int argc, char* argv[]) {
int passed = 0, total = 0;
// Check for help
if (argc > 1 && (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) {
show_help(argv[0]);
return 0;
}
printf("Running NCCL Tuner Plugin Unit Tests\n");
printf("=====================================\n");
if (argc == 1) {
// No arguments - run all tests
for (int i = 0; test_cases[i].name != NULL; i++) {
total++;
passed += test_cases[i].func();
}
} else {
// Run specific tests
for (int arg = 1; arg < argc; arg++) {
TestFunction test_func = find_test(argv[arg]);
if (test_func) {
total++;
passed += test_func();
} else {
printf("ERROR: Unknown test '%s'\n", argv[arg]);
printf("Use --help to see available tests\n");
return 1;
}
}
}
printf("\n=====================================\n");
printf("Test Results: %d/%d tests passed\n", passed, total);
if (passed == total) {
printf("All tests PASSED!\n");
return 0;
} else {
printf("Some tests FAILED!\n");
return 1;
}
}