Merge remote-tracking branch 'nccl/master' into develop

2025-10-06 15:03:19 -05:00
Parent feee02ca61 593de54e52
@@ -3,15 +3,53 @@
 #
 # See LICENSE.txt for license information
 #
-RCCL_HOME:=../../build/release
-HIP_HOME:=/opt/rocm
-INC:= -I$(RCCL_HOME)/include/  -I$(HIP_HOME)/include/ -D__HIP_PLATFORM_AMD__ -Inccl
-PLUGIN_SO:=libnccl-tuner.so

-default: $(PLUGIN_SO)
+# Build the plugin when make is invoked without an explicit goal.
+# .DEFAULT_GOAL is a special variable and must be assigned; writing it with a
+# colon would only declare an ordinary rule named ".DEFAULT_GOAL".
+.DEFAULT_GOAL := build
+PLUGIN_SO:=libnccl-tuner-example.so
+include ../../makefiles/common.mk
+SRCDIR   ?= $(abspath ../..)
+BUILDDIR ?= .
+NCCLDIR  := $(BUILDDIR)

-$(PLUGIN_SO): plugin.c
-	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+# NOTE(review): SRC_FILES and DST_DIR are not referenced by any rule visible in
+# this Makefile; only plugin.c is compiled below. Confirm whether the included
+# common.mk or another Makefile relies on them.
+SRC_FILES := $(wildcard *.c)
+DST_DIR   := $(BUILDDIR)/test/unit/plugins
+
+# `default` and `build` are aliases: both produce the plugin shared object.
+default: ${BUILDDIR}/$(PLUGIN_SO)
+
+build: ${BUILDDIR}/$(PLUGIN_SO)
+
+# Compile plugin.c into a position-independent shared library whose soname is
+# the bare library name. The output directory is created before compiling.
+${BUILDDIR}/$(PLUGIN_SO): plugin.c
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${BUILDDIR}
+	$(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+# Test targets - delegate to the Makefile in the test/ subdirectory.
+# TEST_CASE is forwarded unchanged; it expands to nothing unless the caller
+# sets it.
+test:
+	$(MAKE) -C test test TEST_CASE=$(TEST_CASE)
+
+# Same as `test`, but invokes the sub-make's test-verbose target.
+test-verbose:
+	$(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE)
+
+# Build tests (runs the `all` target of test/ without running anything)
+test-build:
+	$(MAKE) -C test all
+
+# Optimize configurations from performance data
+# CSV_FILE is required; the recipe prints usage and exits with status 1 when it
+# is empty. Each optional variable adds its flag only when it is non-empty:
+#   OUTPUT      -> -o $(OUTPUT)
+#   METRIC      -> -m $(METRIC)
+#   SIZE_RANGES -> --size-ranges $(SIZE_RANGES)
+#   DRY_RUN     -> --dry-run
+#   NO_HEADER   -> --no-header
+optimize-config:
+	@if [ -z "$(CSV_FILE)" ]; then \
+		echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \
+		echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \
+		exit 1; \
+	fi
+	python3 scripts/optimize_config.py $(CSV_FILE) \
+		$(if $(OUTPUT),-o $(OUTPUT)) \
+		$(if $(METRIC),-m $(METRIC)) \
+		$(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \
+		$(if $(DRY_RUN),--dry-run) \
+		$(if $(NO_HEADER),--no-header)

 clean:
-	rm -f $(PLUGIN_SO)
+	rm -f ${BUILDDIR}/$(PLUGIN_SO)
+	$(MAKE) -C test clean
+
+# All of these are commands rather than files; `default` and `build` are
+# included so a stray file with either name cannot mask the target.
+.PHONY: default build test test-verbose test-build optimize-config clean
@@ -0,0 +1,163 @@
+# NCCL Example Tuner Plugin
+
+This example plugin demonstrates a CSV file-based tuning approach that selectively overrides NCCL's tuning parameters based on any of the tuning inputs, without recompiling.
+
+## Features
+
+- **File-based Configuration**: Read tuning parameters from a CSV configuration file
+- **Size-based Tuning**: Specify different configurations based on message size ranges
+- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks
+- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default
+- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE`
+- **Fallback Behavior**: Gracefully handles missing config files and invalid entries
+
+## Building
+
+```bash
+make
+```
+
+This will create `libnccl-tuner-example.so` that can be loaded by NCCL.
+
+## Configuration File Format
+
+The configuration file uses CSV (Comma-Separated Values) format with one configuration per line:
+
+```
+collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+```
+
+### Parameters
+
+- **collective_type**: The collective operation type
+  - `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce`
+
+- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies
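+  - Use `0` for minimum and `4294967295` (2^32 − 1) for maximum to cover all message sizes below 4 GiB; specify a larger `max_bytes` if messages of 4 GiB or more must also match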
+
+- **algorithm**: The NCCL algorithm to use
+  - `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat`
+
+- **protocol**: The NCCL protocol to use
+  - `ll`, `ll128`, `simple`
+
+- **channels**: Number of channels (SMs) to use
+  - Use a positive integer to specify exact channel count
+  - Use `-1` to keep NCCL's default channel selection
+
+- **nNodes**: Number of nodes to match
+  - Use a positive integer to match specific node count
+  - Use `-1` to match any number of nodes
+
+- **nRanks**: Number of ranks to match
+  - Use a positive integer to match specific rank count
+  - Use `-1` to match any number of ranks
+
+- **numPipeOps**: Number of pipeline operations to match (optional)
+  - Use a positive integer to match specific pipeline operation count
+  - Use `-1` to match any number of pipeline operations
+  - If omitted, configuration will match any numPipeOps value
+
+- **regBuff**: Whether user buffer can be registered (optional)
+  - Use `0` to match only non-registered buffers
+  - Use `1` to match only registered buffers
+  - Use `-1` to match either registered or non-registered buffers
+  - If omitted, configuration will match any regBuff value
+
+### Example Configuration
+
+```csv
+# Single-node, small allreduce: use tree algorithm, registered buffers only
+allreduce,0,65536,tree,simple,2,1,-1,-1,1
+
+# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers
+allreduce,65537,1048576,ring,simple,4,4,32,1,0
+
+# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type
+allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
+
+# Single-node broadcast: prefer tree, any pipeOps, any buffer type (8-field backward-compatible form)
+broadcast,0,32768,tree,simple,-1,1,-1
+
+# Multi-node broadcast: optimized for non-registered buffers, single pipeline op
+broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
+```
+
+Lines whose first character is `#` are treated as comments, and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc.
+
+### Backward Compatibility
+
+Configurations without the numPipeOps and/or regBuff parameters are fully supported:
+- 8 fields: matches any numPipeOps and regBuff values
+- 9 fields: matches any regBuff value
+- 10 fields: full parameter specification
+
+This ensures existing configuration files continue to work without modification.
+
+## Usage
+
+### Method 1: Default Config File
+Place your configuration in `nccl_tuner.conf` in the current working directory.
+
+### Method 2: Environment Variable
+Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path:
+
+```bash
+export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf
+mpirun -np 4 your_nccl_application
+```
+
+## Editing Configuration Files
+
+### Generating Configuration Files from Raw Data
+
+A Python script that generates valid CSV configs from raw performance data is provided; see [Using optimize_config.py](scripts/README.md).
+
+### Spreadsheet Tips:
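+- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff`, and make sure the header row starts with `#` in the saved file; an uncommented header row would be parsed as a configuration entry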
+- Save as CSV format (not Excel format) for the plugin to read
+- Use data validation to prevent typos in algorithm/protocol names
+
+## Logging
+
+The plugin uses NCCL's logging system. To see tuner-related messages:
+
+```bash
+export NCCL_DEBUG=INFO
+```
+
+This will show when configurations are loaded and applied, including the topology information.
+
+For detailed debugging output during tuning decisions:
+
+```bash
+export NCCL_DEBUG=TRACE
+```
+
+This will show verbose information about which configurations are being evaluated and matched.
+
+## Dimension Matching
+
+Configurations are only applied when the topology matches:
+
+- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks
+- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks
+- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node
+- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology
+
+This allows you to create specialized configurations for different cluster setups while maintaining flexibility.
+
+## Default Behavior
+
+If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic.
+
+When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size.
+
+## Troubleshooting
+
+1. **Config file not found**: Check the file path and permissions
+2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters
+3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory and that `NCCL_TUNER_PLUGIN` specifies either the plugin name or an absolute path to the plugin shared library.
+4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO`
+5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards
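+6. **CSV parsing errors**: Spaces and tabs around a field are trimmed, but quoted fields are not supported, so do not wrap values in quotes. Do not leave fields empty (consecutive commas are collapsed, which shifts the remaining fields), and keep each line under 256 characters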
@@ -0,0 +1,45 @@
+# NCCL Tuner Configuration File (CSV Format)
+# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+#
+# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
+# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
+# Protocols: ll, ll128, simple
+# Channels: number of channels to use, or -1 to keep default
+# nNodes: number of nodes to match, or -1 for any number of nodes
+# nRanks: number of ranks to match, or -1 for any number of ranks
+# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
+# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
+#
+# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
+#
+# Examples:
+
+# For single-node configurations with registered buffers
+# Small allreduce operations on single node - use tree algorithm, registered buffers
+allreduce,0,65536,tree,simple,2,1,-1,-1,1
+
+# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers
+# Medium allreduce operations - use ring algorithm
+allreduce,65537,1048576,ring,simple,4,4,32,1,0
+
+# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type
+allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
+
+# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types
+# Single node broadcast - prefer tree, any pipeOps, registered buffers only
+broadcast,0,32768,tree,simple,-1,1,-1,-1,1
+
+# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring
+broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
+
+# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type
+allgather,0,4294967295,ring,simple,4,2,-1
+
+# ReduceScatter operations
+# Small messages on single node, single pipeline op, registered buffers
+reducescatter,0,131072,tree,simple,2,1,-1,1,1
+# Large messages on any topology, multiple pipeline ops, non-registered buffers
+reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0
+
+# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type
+reduce,0,4294967295,tree,simple,-1,-1,-1
@@ -5,224 +5,446 @@
 ************************************************************************/

 #include "tuner.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
 #define __hidden __attribute__ ((visibility("hidden")))
-#define HOPPER_COMPCAP_IDX 2
-// NVLink, PCI, Network
-#define NCCL_HW_NVLINK 0
-#define NCCL_HW_PCI 1
-#define NCCL_HW_NET 2
+#define MAX_LINE_LENGTH 256

-static long log2i(long n) {
- long l = 0;
- while (n>>=1) l++;
- return l;
-}
-// Latencies in us, Bandwidths in GB/s
-// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
-static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { 
-       { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 },   // Tree, Ring
-       { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 },   // Collnet Direct, Chain
-       {    0,    0,    0 }, {    0,    0,    0 }};  // NVLS, NVLS Tree
+// CSV field indices for configuration parsing
+// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+#define CONFIG_FIELD_COLLTYPE     0
+#define CONFIG_FIELD_MINBYTES     1
+#define CONFIG_FIELD_MAXBYTES     2
+#define CONFIG_FIELD_ALGORITHM    3
+#define CONFIG_FIELD_PROTOCOL     4
+#define CONFIG_FIELD_CHANNELS     5
+#define CONFIG_FIELD_NNODES       6
+#define CONFIG_FIELD_NRANKS       7
+#define CONFIG_FIELD_PIPEOPS      8  // Optional field
+#define CONFIG_FIELD_REGBUFF      9  // Optional field

-struct tuningModel {
-  float hwLat[3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-  float bwRatio[2][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-  float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
-  float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
-};
+// Field count constants
+#define CONFIG_FIELDS_REQUIRED    8   // Minimum required fields (up to nRanks)
+#define CONFIG_FIELDS_WITH_PIPEOPS 9  // Fields including numPipeOps
+#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff
+#define CONFIG_FIELDS_MAX         10  // Maximum number of fields supported

-static struct tuningModel tuning_model = {
-  {
-    /* NVLINK */
-    { /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
-    /* PCI */
-    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
-    /* NET */
-    { /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
-  },
+// One tuning entry, i.e. one parsed line of the CSV configuration file.
+typedef struct {
+  ncclFunc_t collType;  // Collective this entry applies to (CSV field 0)
+  size_t minBytes;      // Lower bound of the message size range, in bytes (field 1)
+  size_t maxBytes;      // Upper bound of the message size range, in bytes (field 2)
+  int algorithm;        // NCCL_ALGO_* value parsed from field 3
+  int protocol;         // NCCL_PROTO_* value parsed from field 4
+  int nChannels;        // Channel count from field 5; -1 keeps NCCL's default
+  int nNodes;           // Node count to match (field 6); -1 matches any
+  int nRanks;           // Rank count to match (field 7); -1 matches any
+  int numPipeOps;       // Pipeline-op count to match (optional field 8); -1 when omitted or "any"
+  int regBuff;          // Registered-buffer filter (optional field 9); -1 when omitted or "any"
+} TuningConfig;

-  {
-    /* 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
-    /* more than 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 0.86 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
-  },
+// Per-instance tuner state allocated by pluginInit and returned through its
+// `context` out-parameter.
+typedef struct {
+  TuningConfig* configs;  // Heap-allocated array of loaded entries; NULL when nothing was loaded
+  int numConfigs;         // Number of entries currently stored in configs
+  int maxConfigs;         // Number of entries configs was allocated to hold
+  size_t nRanks;          // Rank count passed to pluginInit
+  size_t nNodes;          // Node count passed to pluginInit
+  ncclDebugLogger_t logFunction;  // Logger passed to pluginInit; checked for NULL before each use
+} TunerContext;

-  {
-    { 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1, 0.4, 0.5, 1.0, 0.6, 0.4, 0.6, 0.1, 0.3, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
-    { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
-    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 0.2, 0.7, 1.0, 1.0, 1.0, 0.8, 0.7, 0.7, 0.8, 0.8, 0.8, 0.9, },
-  },
-
-  {
-    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.2, 0.2, 0.1, 0.5, 0.8, 1.0, 0.2, 0.4, 0.5, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
-    { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
-    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
-  },
-};
-
-float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-
-ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorithm, int protocol, int numPipeOps, float* time, size_t nBytes) {
-  float bw = bandwidths[collType][algorithm][protocol];
-  float lat = latencies[collType][algorithm][protocol];
-
-  if (bw == 0) {
-    *time = -1.0; return ncclSuccess;
-  }
-  int logSize = log2i(nBytes>>6);
-  if (algorithm == NCCL_ALGO_TREE) {
-    if (logSize < 27) bw *= tuning_model.treeCorrectionFactor[protocol][logSize];
-    else bw *= tuning_model.treeCorrectionFactor[protocol][26];
-  }
-  else if (algorithm == NCCL_ALGO_RING) {
-    if(logSize < 27) bw *= tuning_model.ringCorrectionFactor[protocol][logSize];
-    else bw *= tuning_model.ringCorrectionFactor[protocol][26];
-  }
-
-  int latCount = 1;
-  *time = lat * latCount + (nBytes) / (1000 * bw);
-  return ncclSuccess;
+// Translate a collective name from the config file into its ncclFunc_t value.
+// The comparison is exact and case-sensitive; a name that matches none of the
+// known collectives yields ncclFuncAllReduce.
+static ncclFunc_t parseCollType(const char* str) {
+  static const struct { const char* name; ncclFunc_t func; } knownColls[] = {
+    { "broadcast",     ncclFuncBroadcast     },
+    { "reduce",        ncclFuncReduce        },
+    { "allgather",     ncclFuncAllGather     },
+    { "reducescatter", ncclFuncReduceScatter },
+    { "allreduce",     ncclFuncAllReduce     },
+  };
+  const size_t nKnown = sizeof(knownColls) / sizeof(knownColls[0]);
+  for (size_t i = 0; i < nKnown; i++) {
+    if (strcmp(str, knownColls[i].name) == 0) return knownColls[i].func;
+  }
+  return ncclFuncAllReduce; // fallback for unrecognized names
 }

-__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { 
-  if (nRanks <= 1) return ncclSuccess;
-  int compCapIndex = HOPPER_COMPCAP_IDX;
-  int index2 = nNodes <= 2 ? nNodes-1 : 2;
-  int index1 = nNodes == 1 ? compCapIndex : 1;
-  float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
+// Return the config-file spelling of a collective type, or "unknown" when the
+// value is not one of the five collectives handled here.
+static const char* collTypeToString(ncclFunc_t collType) {
+  if (collType == ncclFuncBroadcast)     return "broadcast";
+  if (collType == ncclFuncReduce)        return "reduce";
+  if (collType == ncclFuncAllGather)     return "allgather";
+  if (collType == ncclFuncReduceScatter) return "reducescatter";
+  if (collType == ncclFuncAllReduce)     return "allreduce";
+  return "unknown";
+}

-  int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
-  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = NCCL_HW_NVLINK;
-  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
-  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
-    int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
-      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
-      nRanks;
-    int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
-      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
-      nNodes;
+// Translate an algorithm name from the config file into its NCCL_ALGO_* value.
+// The comparison is exact and case-sensitive; a name that matches none of the
+// known algorithms yields NCCL_ALGO_RING.
+static int parseAlgorithm(const char* str) {
+  static const struct { const char* name; int algo; } knownAlgos[] = {
+    { "tree",           NCCL_ALGO_TREE           },
+    { "ring",           NCCL_ALGO_RING           },
+    { "collnet_direct", NCCL_ALGO_COLLNET_DIRECT },
+    { "collnet_chain",  NCCL_ALGO_COLLNET_CHAIN  },
+    { "nvls",           NCCL_ALGO_NVLS           },
+    { "nvls_tree",      NCCL_ALGO_NVLS_TREE      },
+    { "pat",            NCCL_ALGO_PAT            },
+  };
+  const size_t nKnown = sizeof(knownAlgos) / sizeof(knownAlgos[0]);
+  for (size_t i = 0; i < nKnown; i++) {
+    if (strcmp(str, knownAlgos[i].name) == 0) return knownAlgos[i].algo;
+  }
+  return NCCL_ALGO_RING; // fallback for unrecognized names
+}

-    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
-      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
+// Return the config-file spelling of an NCCL_ALGO_* value, or "unknown" when
+// the value is not one of the algorithms handled here.
+static const char* algorithmToString(int algorithm) {
+  if (algorithm == NCCL_ALGO_TREE)           return "tree";
+  if (algorithm == NCCL_ALGO_RING)           return "ring";
+  if (algorithm == NCCL_ALGO_COLLNET_DIRECT) return "collnet_direct";
+  if (algorithm == NCCL_ALGO_COLLNET_CHAIN)  return "collnet_chain";
+  if (algorithm == NCCL_ALGO_NVLS)           return "nvls";
+  if (algorithm == NCCL_ALGO_NVLS_TREE)      return "nvls_tree";
+  if (algorithm == NCCL_ALGO_PAT)            return "pat";
+  return "unknown";
+}

-      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 1) continue;
-        if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
-        int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
-        float bw = nNodes <= 2 || collnet ? 12.0 : 12.0; //graphs[a]->bwIntra : graphs[a]->bwInter
-        if (a == NCCL_ALGO_NVLS) bw = 0.0;
-        if (a == NCCL_ALGO_NVLS_TREE) bw = 0.0;
-        if (collnet == 1) bw = 0.0;
-        int nChannels = 28; //nNodes==1 && MI300
-        float busBw = nChannels * bw; //comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw
-        
-        // Various model refinements
-        if (nNodes <= 2)
-          busBw *= tuning_model.bwRatio[0][a][p];
-        else
-          busBw *= tuning_model.bwRatio[1][a][p];
-        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && nNodes == 1) { busBw = busBw * 1.65; }
+// Translate a protocol name from the config file into its NCCL_PROTO_* value.
+// The comparison is exact and case-sensitive; a name that matches none of the
+// known protocols yields NCCL_PROTO_SIMPLE.
+static int parseProtocol(const char* str) {
+  static const struct { const char* name; int proto; } knownProtos[] = {
+    { "ll",     NCCL_PROTO_LL     },
+    { "ll128",  NCCL_PROTO_LL128  },
+    { "simple", NCCL_PROTO_SIMPLE },
+  };
+  const size_t nKnown = sizeof(knownProtos) / sizeof(knownProtos[0]);
+  for (size_t i = 0; i < nKnown; i++) {
+    if (strcmp(str, knownProtos[i].name) == 0) return knownProtos[i].proto;
+  }
+  return NCCL_PROTO_SIMPLE; // fallback for unrecognized names
+}

-        // Convert bus BW to algorithm BW
-        if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
-          float ratio = 1.0f;
-          if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
-          else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
-          else ratio *= .5;
-          busBw *= ratio;
-        }
-        bandwidths[coll][a][p] = busBw;
-        latencies[coll][a][p] = baseLat[a][p];
-        float intraLat = tuning_model.hwLat[intraHw[a]][a][p];
-        float interLat = tuning_model.hwLat[NCCL_HW_NET][a][p];
+// Return the config-file spelling of an NCCL_PROTO_* value, or "unknown" when
+// the value is not one of the three protocols handled here.
+static const char* protocolToString(int protocol) {
+  if (protocol == NCCL_PROTO_LL)     return "ll";
+  if (protocol == NCCL_PROTO_LL128)  return "ll128";
+  if (protocol == NCCL_PROTO_SIMPLE) return "simple";
+  return "unknown";
+}

-        if (a == NCCL_ALGO_RING) {
-          float lat = tuning_model.hwLat[hw[a]][a][p];
-          if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
-            latencies[coll][a][p] += lat;
-          } else {
-            // Inter-node rings still have to launch nsteps * net overhead.
-            float netOverhead = 0.0;
-            if (nNodes > 1) {
-              netOverhead = 1;
-              if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
-            }
-            if (intraLat < netOverhead) intraLat = netOverhead;
-            latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
-          }
-        } else if (a == NCCL_ALGO_TREE) {
-          latencies[coll][a][p] +=
-            2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
-        } else if (a == NCCL_ALGO_COLLNET_DIRECT) {
-          int minimum = 1;
-          if ((nRanks/nNodes-1) < 1) minimum = (nRanks/nNodes-1);
-          latencies[coll][a][p] +=
-            2 * (minimum * intraLat + (nRanks/nNodes-1) * 0.4) + interLat;  // Add 0.4 us arity serialization latency
-        } else if (a == NCCL_ALGO_COLLNET_CHAIN) {
-          latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
-        } else if (a == NCCL_ALGO_NVLS) {
-          if (nNodes > 1) latencies[coll][a][p] += tuning_model.hwLat[NCCL_HW_NET][a][p];
-        } else if (a == NCCL_ALGO_NVLS_TREE) {
-          latencies[coll][a][p] += 2*(nNodes-1)*tuning_model.hwLat[NCCL_HW_NET][a][p];
+// Count the lines of `filename` that are neither comments nor empty.
+// Returns 0 when the file cannot be opened. Each fgets() call reads at most
+// MAX_LINE_LENGTH-1 characters, so a longer physical line is seen as several
+// chunks and each chunk is tested on its own.
+static int countConfigLines(const char* filename) {
+  FILE* fp = fopen(filename, "r");
+  if (fp == NULL) return 0;
+
+  int nLines = 0;
+  char buf[MAX_LINE_LENGTH];
+
+  for (;;) {
+    if (fgets(buf, sizeof(buf), fp) == NULL) break;
+
+    // A leading '#' marks a comment; a leading newline is an empty line.
+    if (buf[0] == '#' || buf[0] == '\n') continue;
+
+    // Cut the text at its first newline, then count it only if something is left.
+    buf[strcspn(buf, "\n")] = 0;
+    if (buf[0] != '\0') nLines++;
+  }
+
+  fclose(fp);
+  return nLines;
+}
+
+// Load tuning entries from the CSV file `filename` into ctx->configs.
+//
+// The file is processed in two passes: countConfigLines() first counts the
+// non-comment, non-empty lines so the configs array can be allocated once,
+// then the file is rewound and each line is tokenized and parsed.
+//
+// Returns ncclSuccess when the file cannot be opened or has no candidate
+// lines (ctx is not modified in either case), ncclSystemError when the
+// configs array cannot be allocated, and ncclSuccess once parsing finishes.
+//
+// NOTE(review): lines with fewer than CONFIG_FIELDS_REQUIRED fields are
+// skipped without a log message, and parseCollType/parseAlgorithm/
+// parseProtocol map unrecognized names to their defaults, so typos in the
+// file are not reported.
+static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
+  FILE* file = fopen(filename, "r");
+  if (!file) {
+    if (ctx->logFunction) {
+      ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Config file %s not found, using defaults", filename);
+    }
+    return ncclSuccess; // Not finding config file is not an error
+  }
+
+  // First pass: count valid configuration lines
+  int configCount = countConfigLines(filename);
+  if (configCount == 0) {
+    if (ctx->logFunction) {
+      ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: No valid configurations found in %s", filename);
+    }
+    fclose(file);
+    return ncclSuccess;
+  }
+
+  // Allocate memory for configurations based on actual count
+  ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig));
+  if (!ctx->configs) {
+    if (ctx->logFunction) {
+      ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount);
+    }
+    fclose(file);
+    return ncclSystemError;
+  }
+
+  ctx->maxConfigs = configCount;
+  ctx->numConfigs = 0;
+
+  if (ctx->logFunction) {
+    ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount);
+  }
+
+  // Reset file pointer to beginning
+  fseek(file, 0, SEEK_SET);
+
+  char line[MAX_LINE_LENGTH];
+
+  // Second pass: stop at end of file or once the allocated array is full.
+  while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) {
+    // Skip comments and empty lines
+    if (line[0] == '#' || line[0] == '\n') continue;
+
+    // Remove trailing newline
+    line[strcspn(line, "\n")] = 0;
+
+    // Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+    char* token;
+    char* tokens[CONFIG_FIELDS_MAX];
+    int tokenCount = 0;
+
+    // Make a copy of the line for tokenizing
+    char lineCopy[MAX_LINE_LENGTH];
+    strncpy(lineCopy, line, sizeof(lineCopy));
+    lineCopy[sizeof(lineCopy) - 1] = '\0';
+
+    // Tokenize by comma
+    // strtok treats a run of commas as a single delimiter, so empty fields are
+    // dropped and the fields after them shift left.
+    token = strtok(lineCopy, ",");
+    while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) {
+      // Trim whitespace
+      while (*token == ' ' || *token == '\t') token++;
+      char* end = token + strlen(token) - 1;
+      while (end > token && (*end == ' ' || *end == '\t')) {
+        *end = '\0';
+        end--;
+      }
+      tokens[tokenCount++] = token;
+      token = strtok(NULL, ",");
+    }
+
+    // Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10)
+    if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) {
+      TuningConfig* config = &ctx->configs[ctx->numConfigs];
+      config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]);
+      config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10);
+      config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10);
+      config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]);
+      config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]);
+      config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]);
+      config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]);
+      config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]);
+
+      // numPipeOps is optional (9th field, index 8)
+      if (tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) {
+        config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]);
+      } else {
+        config->numPipeOps = -1; // -1 means match any numPipeOps
+      }
+
+      // regBuff is optional (10th field, index 9)
+      if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) {
+        config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]);
+      } else {
+        config->regBuff = -1; // -1 means match any regBuff value
+      }
+
+      ctx->numConfigs++;
+
+      // Log the entry using the raw tokens for names; the four branches differ
+      // only in whether pipeOps/regBuff are printed as a number or as "any".
+      if (ctx->logFunction) {
+        if (config->numPipeOps == -1 && config->regBuff == -1) {
+          ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                           "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any",
+                           tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                           tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                           config->nChannels, config->nNodes, config->nRanks);
+        } else if (config->regBuff == -1) {
+          ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                           "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any",
+                           tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                           tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                           config->nChannels, config->nNodes, config->nRanks, config->numPipeOps);
+        } else if (config->numPipeOps == -1) {
+          ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                           "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d",
+                           tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                           tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                           config->nChannels, config->nNodes, config->nRanks, config->regBuff);
+        } else {
+          ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                           "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d",
+                           tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                           tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                           config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
        }
      }
    }
  }
-  // Protocols/Algorithms enable/disable, and user overrides.
-  // All are enabled except ll128 which is enabled by default only in certain cases.
-  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
-  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };

-  // MNNVL: NVLS not yet supported
-  algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
-  algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
-  algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
-  algoEnable[NCCL_ALGO_NVLS] = 0;
-
-  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    int pEnable = protoEnable[p];
-    if (p == NCCL_PROTO_LL128) {
-      pEnable = 0;
-    }
-    if (pEnable == 0) bandwidths[c][a][p] = 0;
-    if (algoEnable[a] == 0) bandwidths[c][a][p] = 0;
+  fclose(file);
+  if (ctx->logFunction) {
+    ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename);
  }
  return ncclSuccess;
 }

+__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
+  // Creates the per-communicator tuner state, loads the tuning table and hands the
+  // state back through *context. Returns ncclSystemError if the state cannot be
+  // allocated, or whatever loadConfig() reports when it does not succeed.
+  TunerContext* tuner = (TunerContext*)malloc(sizeof(TunerContext));
+  if (tuner == NULL) {
+    return ncclSystemError;
+  }
+
+  // Start from an empty configuration table.
+  tuner->configs = NULL;
+  tuner->numConfigs = 0;
+  tuner->maxConfigs = 0;
+  tuner->nRanks = nRanks;
+  tuner->nNodes = nNodes;
+  tuner->logFunction = logFunction;
+
+  if (logFunction != NULL) {
+    logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
+  }
+
+  // The environment variable wins; otherwise fall back to the default file name.
+  const char* path = getenv("NCCL_TUNER_CONFIG_FILE");
+  if (path == NULL) {
+    path = "nccl_tuner.conf";
+  }
+
+  ncclResult_t status = loadConfig(tuner, path);
+  if (status == ncclSuccess) {
+    *context = tuner;
+    return ncclSuccess;
+  }
+
+  // Loading failed: release whatever was allocated so far (free(NULL) is a no-op).
+  free(tuner->configs);
+  free(tuner);
+  return status;
+}
+
 __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
-                              int collNetSupport, int nvlsSupport, int numPipeOps,
-                              int *algorithm, int *protocol, int* nChannels) {
-                                
-  float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
-  // Find algorithm / protocol.
-  *algorithm = -1;
-  *protocol = -1;
-  int nAlgos = NCCL_NUM_ALGORITHMS;
-  for (int a=0; a<nAlgos; a++) {
-    if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
-    if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1) continue;
-    if (a == NCCL_ALGO_NVLS && collNetSupport != 1) continue;
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      if (p == NCCL_PROTO_LL128) continue;
-      float time;
-      ncclTopoGetAlgoTime_Tuner(collType, a, p, numPipeOps, &time, nBytes);
-        if (time >= 0 && time < minTime) {
-          *algorithm = a;
-          *protocol = p;
-          minTime = time;
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels) {
+  // Scans the loaded tuning configs in order and applies the first one that matches
+  // this call and whose algorithm/protocol entry in collCostTable is usable: that
+  // entry's cost is set to 0.0 and *nChannels takes the config's channel count
+  // (it stays at 1 when the config says -1).
+  // Returns ncclInternalError for a NULL context and ncclSuccess otherwise, including
+  // when no config was applied and the cost table was left untouched.
+  TunerContext* ctx = (TunerContext*)context;
+  if (!ctx) return ncclInternalError;
+
+  // Default channels
+  *nChannels = 1;
+
+  if (ctx->logFunction) {
+    ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d",
+                     collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs);
+  }
+
+  // Cast the collCostTable pointer to a 2D array to fix the segmentation fault
+  // After the cast the table is indexed as table[algorithm][protocol].
+  // NOTE(review): the row stride is the compile-time NCCL_NUM_PROTOCOLS while the bounds
+  // check below uses the runtime numProto - confirm the caller's table always has
+  // NCCL_NUM_PROTOCOLS columns per row.
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+
+  // Look for matching configuration
+  for (int i = 0; i < ctx->numConfigs; i++) {
+    TuningConfig* config = &ctx->configs[i];
+
+    if (ctx->logFunction) {
+      ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d",
+                       i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol),
+                       config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
+    }
+
+    // Check if this config matches the current collective, size range, topology, pipeline ops, and regBuff
+    // A value of -1 in nNodes, nRanks, numPipeOps or regBuff acts as a wildcard.
+    if (config->collType == collType &&
+        nBytes >= config->minBytes &&
+        nBytes <= config->maxBytes &&
+        (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes) &&
+        (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks) &&
+        (config->numPipeOps == -1 || config->numPipeOps == numPipeOps) &&
+        (config->regBuff == -1 || config->regBuff == regBuff)) {
+
+      if (ctx->logFunction) {
+        ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Config matches. Applying algo=%s, proto=%s, channels=%d",
+                         algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels);
+      }
+
+      // Check bounds
+      // NOTE(review): only the upper bounds are checked here - confirm that
+      // config->algorithm and config->protocol can never be negative at this point.
+      if (config->algorithm < numAlgo && config->protocol < numProto) {
+        if (table[config->algorithm][config->protocol] != NCCL_ALGO_PROTO_IGNORE) {
+          if (ctx->logFunction) {
+            ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                             "TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)",
+                             algorithmToString(config->algorithm), protocolToString(config->protocol),
+                             &table[config->algorithm][config->protocol], table[config->algorithm][config->protocol]);
+          }
+          table[config->algorithm][config->protocol] = 0.0; // Set low cost to prefer this configuration
+
+          // Only override channels if not set to -1 (keep default)
+          if (config->nChannels != -1) {
+            *nChannels = config->nChannels;
+          }
+
+          if (ctx->logFunction) {
+            if (config->nChannels == -1) {
+              ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                               "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)",
+                               collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
+                               config->nNodes, config->nRanks);
+            } else {
+              ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                               "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)",
+                               collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
+                               config->nChannels, config->nNodes, config->nRanks);
+            }
+          }
+          // First applied config wins: stop scanning.
+          return ncclSuccess;
+        } else {
+          // The entry is marked NCCL_ALGO_PROTO_IGNORE: leave it untouched and keep
+          // scanning the remaining configs.
+          if (ctx->logFunction) {
+            ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                             "TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE",
+                             algorithmToString(config->algorithm), protocolToString(config->protocol));
+          }
         }
+      } else {
+        if (ctx->logFunction) {
+          ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                           "TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)",
+                           algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto);
+        }
+      }
+    } else {
+      if (ctx->logFunction) {
+        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d",
+                         config->collType == collType,
+                         (nBytes >= config->minBytes && nBytes <= config->maxBytes),
+                         (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes),
+                         (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks),
+                         (config->numPipeOps == -1 || config->numPipeOps == numPipeOps),
+                         (config->regBuff == -1 || config->regBuff == regBuff));
+      }
     }
   }
+
+  // If no specific config found, apply default behavior
+  if (ctx->logFunction) {
+    ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: No matching config found");
+  }
+
   return ncclSuccess;
 }

-__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
+__hidden ncclResult_t pluginDestroy(void* context) {
+  // Releases the tuner state created by pluginInit(); a NULL context is accepted.
+  // Always returns ncclSuccess.
+  TunerContext* tuner = (TunerContext*)context;
+  if (tuner != NULL) {
+    free(tuner->configs);  // free(NULL) is a no-op, so no separate check is needed
+    free(tuner);
+  }
+  return ncclSuccess;
+}

 #define PLUGIN_NAME "Example"

@@ -0,0 +1,106 @@
+# NCCL Tuner Configuration Scripts
+
+This directory contains scripts for optimizing NCCL tuner configurations based on performance data.
+
+## optimize_config.py
+
+A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations.
+
+### Usage
+
+```bash
+python scripts/optimize_config.py [options] <input_csv_file>
+```
+
+### Options
+
+- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`)
+- `-m, --metric METRIC`: Optimization metric (`bandwidth_gbps` or `latency_us`; default: `latency_us`)
+- `--no-header`: Don't add header comments to output file
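+- `--dry-run`: Print configurations without writing to file
+- `--no-auto-ranges`: Disable automatic size range determination and use the hardcoded default ranges
+- `--size-ranges RANGES`: Custom size ranges as comma-separated pairs, e.g. `0-1024,1025-65536`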
+
+### CSV Input Format
+
+The input CSV file should have the following columns:
+
+```csv
+collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
+```
+
+**Required columns:**
+- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.)
+- `size_bytes`: Message size in bytes
+- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.)
+- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`)
+- `channels`: Number of channels (or `-1` for default)
+- `nodes`: Number of nodes (or `-1` for any)
+- `ranks`: Number of ranks (or `-1` for any)
+- `pipeOps`: Number of pipeline operations (or `-1` for any)
+- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any)
+
+**Optional metrics (must have at least one present):**
+- `bandwidth_gbps`: Bandwidth in GB/s (higher is better)
+- `latency_us`: Latency in microseconds (lower is better)
+
+### Examples
+
+**Basic usage (optimizes for latency by default):**
+```bash
+python scripts/optimize_config.py sample_performance_data.csv
+```
+
+**Optimize for bandwidth and write to custom file:**
+```bash
+python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv
+```
+
+**Preview configurations without writing:**
+```bash
+python scripts/optimize_config.py --dry-run performance_data.csv
+```
+
+### How It Works
+
+1. **Data Loading**: Reads CSV performance data and validates format
+2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters
+3. **Size Ranges**: Automatically bins data into size ranges for optimization
+4. **Optimization**: Finds the best performing configuration for each group/size combination
+5. **Output**: Generates NCCL tuner config format and appends to specified file
+
+### Default Size Ranges
+
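+By default, size ranges are derived automatically from the message sizes present in the data. With `--no-auto-ranges`, the script falls back to these hardcoded ranges (in bytes):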
+- Small: 0 - 1,024
+- Medium: 1,025 - 65,536
+- Large: 65,537 - 1,048,576
+- XLarge: 1,048,577 - 16,777,216
+- XXLarge: 16,777,217 - 4,294,967,295
+
+### Sample Data
+
+See `sample_performance_data.csv` for an example of the expected input format.
+
+### Integration with NCCL
+
+The generated configuration file can be used directly with the NCCL tuner plugin:
+
+```bash
+export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf
+export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner-example.so
+mpirun -np 8 your_nccl_application
+```
+
+### Performance Data Collection
+
+To collect performance data for optimization, you can:
+
+1. **Use NCCL benchmarks** with different algorithm/protocol combinations
+2. **Profile your applications** with various tuner settings
+3. **Run systematic sweeps** across parameter combinations
+4. **Use NCCL debug output** to collect timing information
+
+The key is to have comprehensive data covering:
+- Different message sizes (small to large)
+- Various topologies (single node, multi-node)
+- All relevant algorithm/protocol combinations
+- Different channel counts and pipeline configurations
@@ -0,0 +1,430 @@
+#!/usr/bin/env python3
+"""
+NCCL Tuner Configuration Optimizer
+
+Reads a CSV file containing performance data across different tuning parameters
+and generates optimal NCCL tuner configurations based on the best performing
+combinations.
+
+By default, creates growing size ranges that interpolate between the actual data sizes
+for each unique dimension (node count, rank count combination). This ensures that
+different cluster configurations get their own optimized size boundaries, as
+performance characteristics often vary significantly between topologies.
+
+Each dimension gets its own set of ranges starting from 0 and extending to the maximum
+size for that dimension, with boundaries at midpoints between consecutive data sizes.
+
+CSV Input Format:
+collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us
+
+Output Format (NCCL Tuner Config):
+collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+
+Usage Examples:
+  # Auto-create dimension-specific interpolated ranges (default)
+  python3 optimize_config.py data.csv
+
+  # Use custom size ranges (applied to all topologies)
+  python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576"
+
+  # Use hardcoded default ranges (applied to all topologies)
+  python3 optimize_config.py data.csv --no-auto-ranges
+"""
+
+import csv
+import argparse
+import sys
+import os
+from collections import defaultdict
+from typing import Dict, List, Tuple, Any
+
+class PerformanceData:
+    def __init__(self, row: Dict[str, str]):
+        self.collective = row['collective']
+        self.size_bytes = int(row['size_bytes'])
+        self.algorithm = row['algorithm']
+        self.protocol = row['protocol']
+        self.channels = int(row['channels']) if row['channels'] != '-1' else -1
+        self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1
+        self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1
+        self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1
+        self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1
+
+        # Performance metrics
+        self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0))  # Higher is better
+        self.latency_us = float(row.get('latency_us', 0))  # Lower is better
+
+    def get_config_key(self) -> Tuple:
+        """Generate a key for grouping similar configurations"""
+        return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff)
+
+    def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]:
+        """Find which size range this data point belongs to for its dimension"""
+        topology_key = (self.nodes, self.ranks)
+
+        # Get size ranges for this dimension, or fall back to default
+        if topology_key in topology_size_ranges:
+            size_ranges = topology_size_ranges[topology_key]
+        elif (-1, -1) in topology_size_ranges:
+            size_ranges = topology_size_ranges[(-1, -1)]
+        else:
+            # Fallback to first available dimension ranges
+            size_ranges = next(iter(topology_size_ranges.values()))
+
+        for min_size, max_size in size_ranges:
+            if min_size <= self.size_bytes <= max_size:
+                return (min_size, max_size)
+        # If no range found, create a single-point range
+        return (self.size_bytes, self.size_bytes)
+
+class ConfigOptimizer:
+    def __init__(self, optimization_metric: str = 'latency_us'):
+        self.optimization_metric = optimization_metric
+        # Default size ranges - will be overridden by auto-detection
+        self.size_ranges = [
+            (0, 1024),
+            (1025, 64*1024),
+            (64*1024+1, 1024*1024),
+            (1024*1024+1, 16*1024*1024),
+            (16*1024*1024+1, 4*1024*1024*1024-1)
+        ]
+        self.auto_size_ranges = True
+
+    def set_size_ranges(self, ranges: List[Tuple[int, int]]):
+        """Set custom size ranges for optimization"""
+        self.size_ranges = ranges
+        self.auto_size_ranges = False
+
+    def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
+        """Create growing size ranges for each unique (nodes, ranks) dimension"""
+        if not data:
+            return {(-1, -1): self.size_ranges}
+
+        # Group data by dimension (nodes, ranks)
+        topology_data = defaultdict(list)
+        for item in data:
+            topology_key = (item.nodes, item.ranks)
+            topology_data[topology_key].append(item)
+
+        topology_ranges = {}
+
+        for topology_key, items in topology_data.items():
+            nodes, ranks = topology_key
+
+            # Extract unique sizes for this dimension and sort them
+            unique_sizes = sorted(set(item.size_bytes for item in items))
+
+            if len(unique_sizes) <= 1:
+                # Only one size, create a single range from 0 to that size
+                size = unique_sizes[0] if unique_sizes else 0
+                ranges = [(0, size)]
+            else:
+                # Create growing ranges that interpolate between data points
+                ranges = []
+
+                for i, size in enumerate(unique_sizes):
+                    if i == 0:
+                        # First range: 0 to midpoint between first and second size
+                        if len(unique_sizes) > 1:
+                            next_size = unique_sizes[i + 1]
+                            max_size = (size + next_size) // 2
+                        else:
+                            max_size = size
+                        min_size = 0
+                    elif i == len(unique_sizes) - 1:
+                        # Last range: previous max + 1 to current size (and beyond)
+                        min_size = ranges[-1][1] + 1
+                        max_size = size
+                    else:
+                        # Intermediate ranges: previous max + 1 to midpoint with next size
+                        min_size = ranges[-1][1] + 1
+                        next_size = unique_sizes[i + 1]
+                        max_size = (size + next_size) // 2
+
+                    ranges.append((min_size, max_size))
+
+            topology_ranges[topology_key] = ranges
+
+            print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:")
+            for i, (min_size, max_size) in enumerate(ranges):
+                # Count data points that fall in this range for this dimension
+                count = sum(1 for item in items if min_size <= item.size_bytes <= max_size)
+                actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size))
+                if actual_sizes:
+                    size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3])
+                    if len(actual_sizes) > 3:
+                        size_list += f", ... (+{len(actual_sizes)-3} more)"
+                    print(f"  Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})")
+
+        return topology_ranges
+
+    def load_data(self, csv_file: str) -> List[PerformanceData]:
+        """Load performance data from CSV file"""
+        data = []
+        try:
+            with open(csv_file, 'r') as f:
+                reader = csv.DictReader(f)
+                for row in reader:
+                    try:
+                        data.append(PerformanceData(row))
+                    except (ValueError, KeyError) as e:
+                        print(f"Warning: Skipping invalid row: {row} - {e}")
+        except FileNotFoundError:
+            print(f"Error: File {csv_file} not found")
+            sys.exit(1)
+        except Exception as e:
+            print(f"Error reading {csv_file}: {e}")
+            sys.exit(1)
+
+        print(f"Loaded {len(data)} performance data points")
+
+        # Auto-determine size ranges if enabled
+        if self.auto_size_ranges and data:
+            self.topology_size_ranges = self.auto_determine_size_ranges(data)
+        else:
+            # Use default ranges for all topologies
+            self.topology_size_ranges = {(-1, -1): self.size_ranges}
+
+        return data
+
+    def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool:
+        """Determine if new_data is better than current_best"""
+        if self.optimization_metric == 'bandwidth_gbps':
+            return new_data.bandwidth_gbps > current_best.bandwidth_gbps
+        elif self.optimization_metric == 'latency_us':
+            return new_data.latency_us < current_best.latency_us
+        else:
+            # Default to latency
+            return new_data.latency_us < current_best.latency_us
+
+    def optimize_configurations(self, data: List[PerformanceData]) -> List[str]:
+        """Find optimal configurations and return as NCCL config strings"""
+        # Group data by configuration key and size range
+        grouped_data = defaultdict(lambda: defaultdict(list))
+
+        for item in data:
+            config_key = item.get_config_key()
+            size_range = item.get_size_range_key(self.topology_size_ranges)
+            grouped_data[config_key][size_range].append(item)
+
+        # Store optimal configurations before combining ranges
+        optimal_configs = []
+
+        for config_key, size_ranges_dict in grouped_data.items():
+            collective, nodes, ranks, pipeOps, regBuff = config_key
+
+            for (min_size, max_size), items in size_ranges_dict.items():
+                if not items:
+                    continue
+
+                # Find the best performing configuration for this size range
+                best_item = items[0]
+                for item in items[1:]:
+                    if self.is_better(item, best_item):
+                        best_item = item
+
+                # Store the optimal configuration with its range
+                optimal_configs.append({
+                    'collective': collective,
+                    'min_size': min_size,
+                    'max_size': max_size,
+                    'algorithm': best_item.algorithm,
+                    'protocol': best_item.protocol,
+                    'channels': best_item.channels,
+                    'nodes': best_item.nodes,
+                    'ranks': best_item.ranks,
+                    'pipeOps': best_item.pipeOps,
+                    'regBuff': best_item.regBuff,
+                    'metric_value': getattr(best_item, self.optimization_metric)
+                })
+
+        # Combine sequential ranges with identical tunings
+        combined_configs = self.combine_sequential_ranges(optimal_configs)
+
+        # Generate config strings
+        configs = []
+        for config in combined_configs:
+            config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}"
+            configs.append(config_str)
+
+            print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: "
+                  f"{config['algorithm']}/{config['protocol']} channels={config['channels']} "
+                  f"({self.optimization_metric}={config['metric_value']:.3f})")
+
+        return configs
+
+    def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]:
+        """Combine sequential ranges that have identical tuning parameters"""
+        if not configs:
+            return configs
+
+        # Group by collective and topology (nodes, ranks)
+        topology_groups = defaultdict(list)
+        for config in configs:
+            topology_key = (config['collective'], config['nodes'], config['ranks'],
+                          config['pipeOps'], config['regBuff'])
+            topology_groups[topology_key].append(config)
+
+        combined_configs = []
+
+        for topology_key, topology_configs in topology_groups.items():
+            # Sort by min_size to ensure proper ordering
+            topology_configs.sort(key=lambda x: x['min_size'])
+
+            # Group by tuning parameters (algorithm, protocol, channels)
+            tuning_groups = defaultdict(list)
+            for config in topology_configs:
+                tuning_key = (config['algorithm'], config['protocol'], config['channels'])
+                tuning_groups[tuning_key].append(config)
+
+            # For each tuning group, combine sequential ranges
+            for tuning_key, tuning_configs in tuning_groups.items():
+                if not tuning_configs:
+                    continue
+
+                # Sort by min_size
+                tuning_configs.sort(key=lambda x: x['min_size'])
+
+                # Combine sequential ranges
+                current_config = tuning_configs[0].copy()
+
+                for next_config in tuning_configs[1:]:
+                    # Check if ranges are adjacent or overlapping
+                    if current_config['max_size'] + 1 >= next_config['min_size']:
+                        # Extend the current range
+                        current_config['max_size'] = max(current_config['max_size'], next_config['max_size'])
+                        # Update metric value to the better one
+                        if self.optimization_metric == 'bandwidth_gbps':
+                            if next_config['metric_value'] > current_config['metric_value']:
+                                current_config['metric_value'] = next_config['metric_value']
+                        else:  # latency_us or default
+                            if next_config['metric_value'] < current_config['metric_value']:
+                                current_config['metric_value'] = next_config['metric_value']
+                    else:
+                        # Gap between ranges, save current and start new one
+                        combined_configs.append(current_config)
+                        current_config = next_config.copy()
+
+                # Add the last configuration
+                combined_configs.append(current_config)
+
+        # Sort final configs by collective, nodes, ranks, then min_size
+        combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size']))
+
+        original_count = len(configs)
+        combined_count = len(combined_configs)
+        if combined_count < original_count:
+            print(f"Combined {original_count} ranges into {combined_count} ranges "
+                  f"(reduced by {original_count - combined_count})")
+
+        return combined_configs
+
+    def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True):
+        """Append optimized configurations to NCCL tuner config file"""
+        try:
+            # Create directory if it doesn't exist
+            config_dir = os.path.dirname(config_file)
+            if config_dir and not os.path.exists(config_dir):
+                os.makedirs(config_dir)
+                print(f"Created directory: {config_dir}")
+
+            # Check if file exists and has content
+            file_exists = os.path.exists(config_file)
+            add_separator = False
+
+            if file_exists:
+                with open(config_file, 'r') as f:
+                    content = f.read().strip()
+                    add_separator = len(content) > 0
+                print(f"Appending to existing file: {config_file}")
+            else:
+                print(f"Creating new file: {config_file}")
+
+            with open(config_file, 'a') as f:
+                if add_separator:
+                    f.write("\n\n")
+
+                if add_header:
+                    f.write(f"# Optimized configurations generated by optimize_config.py\n")
+                    f.write(f"# Optimization metric: {self.optimization_metric}\n")
+                    f.write(f"# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n")
+
+                for config in configs:
+                    f.write(f"{config}\n")
+
+            if file_exists:
+                print(f"Appended {len(configs)} optimized configurations to {config_file}")
+            else:
+                print(f"Created {config_file} with {len(configs)} optimized configurations")
+
+        except PermissionError:
+            print(f"Error: Permission denied writing to {config_file}")
+            print("Try running with appropriate permissions or choose a different output location")
+            sys.exit(1)
+        except OSError as e:
+            print(f"Error: Cannot create/write to {config_file}: {e}")
+            print("Check that the path is valid and you have write permissions")
+            sys.exit(1)
+        except Exception as e:
+            print(f"Unexpected error writing to {config_file}: {e}")
+            sys.exit(1)
+
+def main():
+    parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data")
+    parser.add_argument("csv_file", help="Input CSV file with performance data")
+    parser.add_argument("-o", "--output", default="nccl_tuner.conf",
+                       help="Output NCCL tuner config file (default: nccl_tuner.conf)")
+    parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'],
+                       default='latency_us', help="Optimization metric (default: latency_us)")
+    parser.add_argument("--no-header", action="store_true",
+                       help="Don't add header comments to output file")
+    parser.add_argument("--dry-run", action="store_true",
+                       help="Print configurations without writing to file")
+    parser.add_argument("--no-auto-ranges", action="store_true",
+                       help="Disable automatic size range determination (use default ranges)")
+    parser.add_argument("--size-ranges", type=str,
+                       help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'")
+
+    args = parser.parse_args()
+
+    optimizer = ConfigOptimizer(args.metric)
+
+    # Handle size range configuration
+    if args.size_ranges:
+        # Parse custom size ranges
+        try:
+            ranges = []
+            for range_str in args.size_ranges.split(','):
+                min_size, max_size = map(int, range_str.split('-'))
+                ranges.append((min_size, max_size))
+            optimizer.set_size_ranges(ranges)
+            print(f"Using custom size ranges: {ranges}")
+        except ValueError:
+            print("Error: Invalid size ranges format. Use 'min1-max1,min2-max2,...'")
+            sys.exit(1)
+    elif args.no_auto_ranges:
+        # Disable auto-ranging
+        optimizer.auto_size_ranges = False
+        print("Using default hardcoded size ranges")
+    else:
+        # Auto-ranging is enabled by default - creates one bucket per unique size
+        optimizer.auto_size_ranges = True
+        print("Auto-ranging enabled: will create one bucket per unique size in data")
+
+    # Load and optimize data
+    data = optimizer.load_data(args.csv_file)
+    if not data:
+        print("No valid data found in CSV file")
+        sys.exit(1)
+
+    configs = optimizer.optimize_configurations(data)
+
+    if args.dry_run:
+        print("\nGenerated configurations:")
+        for config in configs:
+            print(config)
+    else:
+        optimizer.append_to_config_file(configs, args.output, not args.no_header)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,24 @@
+collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
+allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5
+allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8
+allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2
+allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1
+allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3
+allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4
+allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2
+allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1
+allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7
+allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1
+broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2
+broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1
+broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5
+broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2
+broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1
+broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3
+allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4
+allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6
+allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1
+allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3
+allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2
+allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1
+allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3
@@ -0,0 +1,30 @@
+#
+# Makefile for NCCL Tuner Plugin Unit Tests
+#
+# Targets:
+#   all           - build the test executable (default)
+#   test          - build and run the tests; TEST_CASE is passed as the argument
+#   test-verbose  - same as test, with NCCL_DEBUG=INFO in the environment
+#   clean         - remove the executable and generated/temporary files
+#
+
+CC := gcc
+CFLAGS := -Wall -Wextra -g -std=c99 -fPIC
+INC := -I. -I../nccl
+TARGET := test_plugin
+SOURCES := test_plugin.c
+# test_plugin.c textually #includes ../plugin.c and the headers found through
+# -I../nccl. They are never passed to the compiler directly, but a change to
+# any of them must still trigger a rebuild of the test executable.
+DEPS := ../plugin.c $(wildcard ../nccl/*.h)
+
+# Default target
+all: $(TARGET)
+
+# Build the test executable (only $(SOURCES) is compiled; $(DEPS) are pulled
+# in by the preprocessor)
+$(TARGET): $(SOURCES) $(DEPS)
+	$(CC) $(CFLAGS) $(INC) -o $@ $(SOURCES)
+
+# Run the tests
+test: $(TARGET)
+	./$(TARGET) $(TEST_CASE)
+
+# Run tests with verbose output
+test-verbose: $(TARGET)
+	NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE)
+
+# Clean build artifacts
+clean:
+	rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf
+
+.PHONY: all test test-verbose clean
@@ -0,0 +1,205 @@
+# NCCL Tuner Plugin Unit Tests
+
+This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates.
+
+## Test Structure
+
+```
+test/
+├── test_plugin.c     # Main unit test file
+├── Makefile          # Build system for tests
+└── README.md         # This file
+```
+
+## Building and Running Tests
+
+### Quick Start
+
+```bash
+# Build and run all tests
+make test
+
+# Or step by step
+make           # Build test executable
+./test_plugin  # Run tests
+```
+
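+### Advanced Testing
+
+> **Note:** The `Makefile` in this directory currently defines only the `all`, `test`, `test-verbose`, and `clean` targets. The `test-memory`, `coverage`, `test-configs`, and `install-test` targets referenced in this document must be added to the Makefile before the corresponding commands will work.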
+
+```bash
+# Run with memory leak detection (requires valgrind)
+make test-memory
+
+# Run with verbose logging
+make test-verbose
+
+# Generate code coverage report (requires gcov)
+make coverage
+
+# Create sample test configuration files
+make test-configs
+```
+
+## Test Coverage
+
+The unit tests cover the following functionality:
+
+### 1. **Plugin Initialization (`test_plugin_init`)**
+- Tests successful plugin initialization
+- Verifies context allocation
+- Tests cleanup on destroy
+
+### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)**
+- Valid CSV format parsing
+- Comment and empty line handling
+- Invalid format graceful handling
+- Environment variable configuration
+
+### 3. **Collective Type Matching (`test_collective_matching`)**
+- Correct matching of allreduce, broadcast, etc.
+- Algorithm/protocol selection
+- Channel configuration
+
+### 4. **Size Range Matching (`test_size_matching`)**
+- Small, medium, large message size handling
+- Proper range boundary checking
+- Multiple size-based configurations
+
+### 5. **Topology Matching (`test_topology_matching`)**
+- Single-node vs multi-node configurations
+- Exact nNodes/nRanks matching
+- Wildcard matching (-1 values)
+
+### 6. **Default Channels (`test_default_channels`)**
+- Proper handling of -1 channel specification
+- Preservation of NCCL default behavior
+
+### 7. **Registered Buffer Matching (`test_regbuff_matching`)**
+- Configurations based on regBuff parameter
+- Registered vs non-registered buffer handling
+- Backward compatibility with configs missing regBuff
+
+### 8. **Pipeline Operations Matching (`test_pipeops_matching`)**
+- Configurations based on numPipeOps parameter
+- Single vs multiple pipeline operation handling
+- Backward compatibility with configs missing numPipeOps
+
+### 9. **Fallback Behavior (`test_no_match_fallback`)**
+- Default behavior when no config matches
+- Ring/Simple algorithm fallback
+
+## Test Output
+
+Successful test run:
+```
+Running NCCL Tuner Plugin Unit Tests
+=====================================
+PASS: test_plugin_init
+PASS: test_config_parsing_valid
+PASS: test_config_parsing_invalid
+PASS: test_collective_matching
+PASS: test_size_matching
+PASS: test_topology_matching
+PASS: test_default_channels
+PASS: test_regbuff_matching
+PASS: test_pipeops_matching
+PASS: test_no_match_fallback
+
+=====================================
+Test Results: 10/10 tests passed
+All tests PASSED!
+```
+
+Failed test example:
+```
+FAIL: test_collective_matching - Tree/Simple should have low cost
+Test Results: 9/10 tests passed
+Some tests FAILED!
+```
+
+## Mock NCCL Implementation
+
+The tests use the actual NCCL header files from the `../nccl/` directory:
+
+- `tuner.h` - Complete NCCL tuner interface and type definitions
+- `common.h` - Common NCCL types and logging functions
+- `err.h` - NCCL error codes
+
+This allows testing with the real NCCL interface definitions while still being able to run tests without the full NCCL library installation.
+
+## Integration with CI/CD
+
+```bash
+# Install tests for CI/CD pipeline
+make install-test
+
+# Run as part of automated testing
+make test && echo "Tests passed" || echo "Tests failed"
+```
+
+## Memory Testing
+
+The tests can be run with valgrind for memory leak detection:
+
+```bash
+make test-memory
+```
+
+This will detect:
+- Memory leaks
+- Invalid memory access
+- Use of uninitialized memory
+
+## Code Coverage
+
+Generate code coverage reports to ensure comprehensive testing:
+
+```bash
+make coverage
+# Creates test_plugin.c.gcov with line-by-line coverage
+```
+
+## Adding New Tests
+
+To add a new test:
+
+1. Create a new test function in `test_plugin.c`:
+```c
+int test_new_feature() {
+  // Test setup
+  TEST_ASSERT(condition, "description");
+  // Test cleanup
+  TEST_PASS();
+}
+```
+
+2. Add the test to the main function:
+```c
+total++; passed += test_new_feature();
+```
+
+3. Rebuild and run:
+```bash
+make test
+```
+
+## Debugging Tests
+
+For debugging failed tests:
+
+```bash
+# Compile with debug symbols
+make CFLAGS="-g -O0 -DDEBUG"
+
+# Run with gdb
+gdb ./test_plugin
+```
+
+## Cleaning Up
+
+```bash
+# Remove all build artifacts and temporary files
+make clean
+```
+
+This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases.
@@ -0,0 +1,856 @@
+/*************************************************************************
+ * Unit tests for NCCL Tuner Plugin
+ ************************************************************************/
+
+#define _GNU_SOURCE  // Enable setenv/unsetenv and other GNU extensions
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <stdarg.h>
+
+
+// Include NCCL tuner header (which includes common.h and err.h)
+#include "tuner.h"
+
+// Include plugin source for testing
+#include "../plugin.c"
+
+// Test framework macros.
+// Every test is an int function: it returns 1 on success and 0 on failure.
+
+// When `condition` is false, print a FAIL line naming the enclosing test and
+// make that test return 0 immediately; otherwise fall through.
+#define TEST_ASSERT(condition, message) \
+  do { \
+    if ((condition)) break; \
+    printf("FAIL: %s - %s\n", __func__, message); \
+    return 0; \
+  } while (0)
+
+// Print a PASS line naming the enclosing test and make it return 1.
+#define TEST_PASS() \
+  do { printf("PASS: %s\n", __func__); return 1; } while (0)
+
+// Global test state: number of times mock_logger has been invoked.
+static int test_log_count = 0;
+
+// Logger handed to the plugin in place of NCCL's own.
+// Every call is counted in test_log_count. A message is written to stdout only
+// when the NCCL_DEBUG environment variable is TRACE (everything), INFO
+// (levels up to NCCL_LOG_INFO) or WARN (levels up to NCCL_LOG_WARN); with any
+// other value, or with the variable unset, nothing is printed.
+void mock_logger(ncclDebugLogLevel level, unsigned long flags,
+                 const char* file, int line, const char* fmt, ...) {
+  (void)flags; // unused
+  ++test_log_count;
+
+  // Decide whether this message passes the NCCL_DEBUG filter.
+  const char* verbosity = getenv("NCCL_DEBUG");
+  if (verbosity == NULL) return;
+
+  int emit;
+  if (strcmp(verbosity, "TRACE") == 0) {
+    emit = 1;
+  } else if (strcmp(verbosity, "INFO") == 0) {
+    emit = (level <= NCCL_LOG_INFO);
+  } else if (strcmp(verbosity, "WARN") == 0) {
+    emit = (level <= NCCL_LOG_WARN);
+  } else {
+    emit = 0;
+  }
+  if (!emit) return;
+
+  // Human-readable name of the log level.
+  const char* label = "UNKNOWN";
+  switch (level) {
+    case NCCL_LOG_NONE:    label = "NONE";    break;
+    case NCCL_LOG_VERSION: label = "VERSION"; break;
+    case NCCL_LOG_WARN:    label = "WARN";    break;
+    case NCCL_LOG_INFO:    label = "INFO";    break;
+    case NCCL_LOG_ABORT:   label = "ABORT";   break;
+    case NCCL_LOG_TRACE:   label = "TRACE";   break;
+    default: break;
+  }
+
+  // Header, formatted payload, newline.
+  printf("[TUNER:%s:%s:%d] ", label, file, line);
+
+  va_list ap;
+  va_start(ap, fmt);
+  vprintf(fmt, ap);
+  va_end(ap);
+
+  printf("\n");
+}
+
+// Helper function to create test config file.
+// Writes `content` verbatim to `filename`, truncating any existing file.
+// The function returns nothing, so a failure to open, write or close the file
+// is reported on stderr instead of being silently ignored; otherwise the test
+// that depends on the file would fail later with a misleading message.
+void create_test_config(const char* filename, const char* content) {
+  FILE* f = fopen(filename, "w");
+  if (f == NULL) {
+    perror(filename);
+    return;
+  }
+
+  // Always close the stream, even when the write itself failed.
+  int write_failed = (fputs(content, f) == EOF);
+  int close_failed = (fclose(f) != 0);
+  if (write_failed || close_failed) {
+    fprintf(stderr, "create_test_config: failed to write %s\n", filename);
+  }
+}
+
+// Test 1: Plugin initialization
+// Initializes the plugin for 8 ranks on 2 nodes and checks that a context is
+// handed back, then destroys it.
+int test_plugin_init() {
+  void* ctx = NULL;
+  const ncclResult_t rc = pluginInit(8, 2, mock_logger, &ctx);
+
+  TEST_ASSERT(rc == ncclSuccess, "Plugin init should succeed");
+  TEST_ASSERT(ctx != NULL, "Context should be allocated");
+
+  pluginDestroy(ctx);
+  TEST_PASS();
+}
+
+// Test 2: Configuration file parsing - valid CSV
+// Writes a config containing data lines, comment lines and a blank line, and
+// checks that plugin initialization succeeds with it.
+int test_config_parsing_valid() {
+  static const char kConfigPath[] = "test_valid.conf";
+
+  const char* contents =
+    "# Test configuration\n"
+    "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n"
+    "broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n"
+    "# Comment line\n"
+    "\n"
+    "reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n";
+  create_test_config(kConfigPath, contents);
+
+  // Point the plugin at the file just written.
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* ctx = NULL;
+  const ncclResult_t rc = pluginInit(16, 2, mock_logger, &ctx);
+  TEST_ASSERT(rc == ncclSuccess, "Plugin init with valid config should succeed");
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(ctx);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 3: Configuration file parsing - invalid CSV
+// Feeds the plugin a file whose lines are truncated, name an unknown
+// collective, or carry non-numeric sizes, and checks that initialization
+// still reports success.
+int test_config_parsing_invalid() {
+  static const char kConfigPath[] = "test_invalid.conf";
+
+  create_test_config(kConfigPath,
+    "allreduce,0,65536,tree,simple,2,1  # Missing nRanks and other fields\n"
+    "invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n"
+    "broadcast,abc,def,ring,simple,1,1,1,-1,-1\n");
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* ctx = NULL;
+  const ncclResult_t rc = pluginInit(8, 1, mock_logger, &ctx);
+  // Bad lines are expected to be tolerated rather than to abort init.
+  TEST_ASSERT(rc == ncclSuccess, "Plugin init should succeed even with invalid config");
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(ctx);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 4: Collective type matching
+// Loads one allreduce entry and one broadcast entry, then checks that each
+// collective picks up its own algorithm/protocol (cost set to 0.0) and
+// channel count.
+int test_collective_matching() {
+  static const char kConfigPath[] = "test_match.conf";
+
+  create_test_config(kConfigPath,
+    "allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n"
+    "broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n");
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* ctx = NULL;
+  pluginInit(8, 1, mock_logger, &ctx);
+
+  // Mock cost table plus the row-pointer view passed to pluginGetCollInfo.
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_rows[NCCL_NUM_ALGORITHMS];
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    cost_rows[a] = costs[a];
+  }
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+      costs[a][p] = 1.0; // Default high cost
+    }
+  }
+
+  int channels;
+  ncclResult_t rc;
+
+  // Allreduce of 32 KiB: expected to hit the first entry.
+  rc = pluginGetCollInfo(ctx, ncclFuncAllReduce, 32768, 1,
+                         cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                         0, &channels);
+  TEST_ASSERT(rc == ncclSuccess, "GetCollInfo should succeed");
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
+              &costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost");
+  TEST_ASSERT(channels == 8, "Should set 8 channels");
+
+  // Reset the table before the second query.
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+      costs[a][p] = 1.0;
+    }
+  }
+
+  // Broadcast of 16 KiB: expected to hit the second entry.
+  rc = pluginGetCollInfo(ctx, ncclFuncBroadcast, 16384, 1,
+                         cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                         0, &channels);
+  TEST_ASSERT(rc == ncclSuccess, "GetCollInfo should succeed");
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
+              &costs[NCCL_ALGO_RING][NCCL_PROTO_LL128], costs[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
+  TEST_ASSERT(costs[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost");
+  TEST_ASSERT(channels == 4, "Should set 4 channels");
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(ctx);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 5: Size range matching
+// Three allreduce entries cover adjacent byte ranges; a 512 B, a 32 KiB and a
+// 1 MiB query must each select the entry whose range contains the size.
+int test_size_matching() {
+  static const char kConfigPath[] = "test_size.conf";
+
+  create_test_config(kConfigPath,
+    "allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n"
+    "allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n"
+    "allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n");
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* ctx = NULL;
+  pluginInit(8, 1, mock_logger, &ctx);
+
+  // Cost table plus the row-pointer view passed to pluginGetCollInfo.
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_rows[NCCL_NUM_ALGORITHMS];
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    cost_rows[a] = costs[a];
+  }
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+      costs[a][p] = 1.0;
+    }
+  }
+  int channels = 1;
+
+  // Small message (512 B): first entry.
+  pluginGetCollInfo(ctx, ncclFuncAllReduce, 512, 1,
+                    cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &channels);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
+              &costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost");
+  TEST_ASSERT(channels == 2, "Small: Should set 2 channels");
+
+  // Medium message (32 KiB): second entry.
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+      costs[a][p] = 1.0;
+    }
+  }
+  pluginGetCollInfo(ctx, ncclFuncAllReduce, 32768, 1,
+                    cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &channels);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
+              &costs[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], costs[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(costs[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost");
+  TEST_ASSERT(channels == 4, "Medium: Should set 4 channels");
+
+  // Large message (1 MiB): third entry.
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+      costs[a][p] = 1.0;
+    }
+  }
+  pluginGetCollInfo(ctx, ncclFuncAllReduce, 1048576, 1,
+                    cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &channels);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
+              &costs[NCCL_ALGO_RING][NCCL_PROTO_LL128], costs[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
+  TEST_ASSERT(costs[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost");
+  TEST_ASSERT(channels == 8, "Large: Should set 8 channels");
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(ctx);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 6: Topology matching
+// Loads three allreduce entries (single node, exactly 4 nodes / 32 ranks, and
+// a wildcard) and checks that a context created for 8 ranks / 1 node and one
+// created for 32 ranks / 4 nodes each select the entry written for them.
+// Returns 1 on success, 0 on the first failed assertion.
+int test_topology_matching() {
+  const char* test_config =
+    "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n"      // Single node only
+    "allreduce,0,65536,ring,simple,4,4,32,-1,-1\n"      // 4 nodes, 32 ranks exactly
+    "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n";     // Any topology
+
+  create_test_config("test_topo.conf", test_config);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1);
+
+  // Test with single node setup
+  void* context1 = NULL;
+  pluginInit(8, 1, mock_logger, &context1);  // 8 ranks, 1 node
+
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  int nChannels;
+  pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
+  TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
+
+  pluginDestroy(context1);
+
+  // Test with 4 nodes, 32 ranks setup
+  void* context2 = NULL;
+  pluginInit(32, 4, mock_logger, &context2);  // 32 ranks, 4 nodes
+
+  // Reset the cost table before querying the second context
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config");
+  TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels");
+
+  // Clean up. The second context must be destroyed as well; it was
+  // previously left allocated, which leaked it.
+  pluginDestroy(context2);
+  unlink("test_topo.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 7: Default channels behavior (-1)
+// An entry whose channel field is -1 must still apply its algorithm/protocol,
+// and the channel count reported back must be 1.
+int test_default_channels() {
+  static const char kConfigPath[] = "test_default.conf";
+
+  // Channel field is -1: use default channels.
+  create_test_config(kConfigPath,
+    "allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n");
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* ctx = NULL;
+  pluginInit(8, 1, mock_logger, &ctx);
+
+  // Cost table plus the row-pointer view passed to pluginGetCollInfo.
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_rows[NCCL_NUM_ALGORITHMS];
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    cost_rows[a] = costs[a];
+  }
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+      costs[a][p] = 1.0;
+    }
+  }
+
+  // Start from a sentinel so an untouched output would be noticed.
+  int channels = 99;
+  pluginGetCollInfo(ctx, ncclFuncAllReduce, 32768, 1,
+                    cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &channels);
+
+  TEST_ASSERT(costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol");
+  TEST_ASSERT(channels == 1, "Should keep default channels (1) when config has -1");
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(ctx);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 8: regBuff matching
+// Three allreduce entries differ only in their regBuff field (1, 0, -1).
+// Queries with regBuff 1, 0 and 2 must select the first, second and third
+// entry respectively.
+int test_regbuff_matching() {
+  static const char kConfigPath[] = "test_regbuff.conf";
+
+  create_test_config(kConfigPath,
+    "allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n"      // Registered buffers only
+    "allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n"      // Non-registered buffers only
+    "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n");    // Any buffer type (backward compatible)
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* ctx = NULL;
+  pluginInit(8, 1, mock_logger, &ctx);
+
+  // Cost table plus the row-pointer view passed to pluginGetCollInfo.
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_rows[NCCL_NUM_ALGORITHMS];
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    cost_rows[a] = costs[a];
+  }
+
+  // One row per query: the regBuff value passed in, the table cell expected
+  // to drop to 0.0, the expected channel count, and the failure messages.
+  const struct {
+    int regBuff;
+    int algo;
+    int proto;
+    int channels;
+    const char* cost_msg;
+    const char* chan_msg;
+  } cases[] = {
+    // Registered buffer: first entry
+    { 1, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, 2,
+      "Registered buffer: Tree/Simple should have low cost",
+      "Registered buffer: Should set 2 channels" },
+    // Non-registered buffer: second entry
+    { 0, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, 4,
+      "Non-registered buffer: Ring/Simple should have low cost",
+      "Non-registered buffer: Should set 4 channels" },
+    // Unusual value 2: only the wildcard (third) entry can match
+    { 2, NCCL_ALGO_RING, NCCL_PROTO_LL128, 8,
+      "Any regBuff: Ring/LL128 should have low cost",
+      "Any regBuff: Should set 8 channels" },
+  };
+
+  int channels;
+  for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); ++c) {
+    // Fresh table for every query.
+    for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+      for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+        costs[a][p] = 1.0;
+      }
+    }
+
+    pluginGetCollInfo(ctx, ncclFuncAllReduce, 32768, 1,
+                      cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                      cases[c].regBuff, &channels);
+    TEST_ASSERT(costs[cases[c].algo][cases[c].proto] == 0.0, cases[c].cost_msg);
+    TEST_ASSERT(channels == cases[c].channels, cases[c].chan_msg);
+  }
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(ctx);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 9: numPipeOps matching
+// Three allreduce entries differ only in their pipeOps field (1, 4, -1).
+// Queries with numPipeOps 1, 4 and 2 must select the first, second and third
+// entry respectively.
+int test_pipeops_matching() {
+  static const char kConfigPath[] = "test_pipeops.conf";
+
+  create_test_config(kConfigPath,
+    "allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n"      // Single pipeline op
+    "allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n"      // Multiple pipeline ops
+    "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n");    // Any pipeline ops (backward compatible)
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* ctx = NULL;
+  pluginInit(8, 1, mock_logger, &ctx);
+
+  // Cost table plus the row-pointer view passed to pluginGetCollInfo.
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_rows[NCCL_NUM_ALGORITHMS];
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    cost_rows[a] = costs[a];
+  }
+
+  // One row per query: the numPipeOps value passed in, the table cell
+  // expected to drop to 0.0, the expected channel count, and the messages.
+  const struct {
+    int pipeOps;
+    int algo;
+    int proto;
+    int channels;
+    const char* cost_msg;
+    const char* chan_msg;
+  } cases[] = {
+    // Single pipeline op: first entry
+    { 1, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, 2,
+      "Single pipeOp: Tree/Simple should have low cost",
+      "Single pipeOp: Should set 2 channels" },
+    // Four pipeline ops: second entry
+    { 4, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, 4,
+      "Multiple pipeOps: Ring/Simple should have low cost",
+      "Multiple pipeOps: Should set 4 channels" },
+    // Two pipeline ops: only the wildcard (third) entry can match
+    { 2, NCCL_ALGO_RING, NCCL_PROTO_LL128, 8,
+      "Any pipeOps: Ring/LL128 should have low cost",
+      "Any pipeOps: Should set 8 channels" },
+  };
+
+  int channels;
+  for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); ++c) {
+    // Fresh table for every query.
+    for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+      for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+        costs[a][p] = 1.0;
+      }
+    }
+
+    pluginGetCollInfo(ctx, ncclFuncAllReduce, 32768, cases[c].pipeOps,
+                      cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                      0, &channels);
+    TEST_ASSERT(costs[cases[c].algo][cases[c].proto] == 0.0, cases[c].cost_msg);
+    TEST_ASSERT(channels == cases[c].channels, cases[c].chan_msg);
+  }
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(ctx);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 10: No matching configuration (fallback behavior)
+// Loads a config that only covers broadcast and then queries allreduce.
+// With no matching entry the cost table must come back untouched (every
+// cell still 1.0) and the reported channel count must be 1.
+// Returns 1 on success, 0 on the first failed assertion.
+int test_no_match_fallback() {
+  const char* test_config =
+    "broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n";  // Only broadcast config
+
+  create_test_config("test_fallback.conf", test_config);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
+
+  void* context = NULL;
+  pluginInit(8, 1, mock_logger, &context);
+
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  int nChannels;
+  // Try allreduce (should not match, use fallback)
+  pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+
+  // The log text states 1.0 so that it agrees with the assertion below; it
+  // previously claimed 0.0, contradicting the value actually checked.
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 1.0)",
+              &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Cost table should pass through unmodified");
+  TEST_ASSERT(nChannels == 1, "Should use default channels");
+
+  // Clean up
+  pluginDestroy(context);
+  unlink("test_fallback.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 11: Large configuration files (testing dynamic allocation)
+// Generates a 500-entry config (five times the former static MAX_CONFIGS=100
+// limit), loads it, and checks that every entry was stored, that the stored
+// entries look sane, and that lookups still succeed.
+int test_large_config() {
+  static const char kConfigPath[] = "test_large.conf";
+  const int entry_count = 500; // 5x the old static limit
+
+  // Value pools cycled through to vary the generated entries.
+  const char* coll_names[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"};
+  const char* algo_names[] = {"tree", "ring", "collnet_direct", "nvls"};
+  const char* proto_names[] = {"simple", "ll", "ll128"};
+
+  FILE* out = fopen(kConfigPath, "w");
+  TEST_ASSERT(out != NULL, "Should be able to create large config file");
+
+  // Header comments
+  fputs("# Large configuration file for testing dynamic allocation\n", out);
+  fputs("# This file contains many configurations to test memory allocation\n", out);
+
+  for (int idx = 0; idx < entry_count; ++idx) {
+    size_t lo_bytes = (idx * 1024) % 1048576;  // wraps at 1MB
+    size_t hi_bytes = lo_bytes + 65536;        // 64KB range
+
+    // -1 acts as the wildcard for the topology / pipeOps / regBuff fields.
+    int nodes = idx % 4;                       // -1 or 1-3
+    if (nodes == 0) nodes = -1;
+
+    int ranks = -1;                            // -1 or 1-32
+    if (idx % 8 != 0) ranks = (idx % 32) + 1;
+
+    int pipe_ops = -1;                         // -1 or 1-4
+    int reg_buff = -1;                         // -1, 0 or 1
+    if (idx % 3 != 0) {
+      pipe_ops = (idx % 4) + 1;
+      reg_buff = idx % 2;
+    }
+
+    fprintf(out, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n",
+            coll_names[idx % 5], lo_bytes, hi_bytes,
+            algo_names[idx % 4], proto_names[idx % 3],
+            (idx % 8) + 1,                     // 1-8 channels
+            nodes, ranks, pipe_ops, reg_buff);
+  }
+  fclose(out);
+
+  // Load the generated file through the plugin.
+  setenv("NCCL_TUNER_CONFIG_FILE", kConfigPath, 1);
+
+  void* context = NULL;
+  ncclResult_t rc = pluginInit(16, 4, mock_logger, &context);
+  TEST_ASSERT(rc == ncclSuccess, "Plugin init with large config should succeed");
+  TEST_ASSERT(context != NULL, "Context should be allocated");
+
+  // Inspect the plugin's internal state directly.
+  TunerContext* ctx = (TunerContext*)context;
+  TEST_ASSERT(ctx->numConfigs == entry_count, "Should load all configurations from large file");
+  TEST_ASSERT(ctx->maxConfigs == entry_count, "maxConfigs should match allocated size");
+  TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated");
+
+  // Walk the whole array, including the part past the old 100-entry limit.
+  for (int idx = 0; idx < ctx->numConfigs; ++idx) {
+    TuningConfig* entry = ctx->configs + idx;
+    TEST_ASSERT(entry->collType >= ncclFuncBroadcast && entry->collType <= ncclFuncAllReduce,
+                "Collective type should be valid");
+    TEST_ASSERT(entry->maxBytes >= entry->minBytes, "maxBytes should be >= minBytes");
+    TEST_ASSERT(entry->nChannels > 0, "nChannels should be positive");
+  }
+
+  // Spot-check the first, middle and last slots.
+  TuningConfig* head = &ctx->configs[0];
+  TEST_ASSERT(head != NULL, "First config should be accessible");
+
+  TuningConfig* middle = &ctx->configs[entry_count / 2];
+  TEST_ASSERT(middle != NULL, "Middle config should be accessible");
+
+  TuningConfig* tail = &ctx->configs[entry_count - 1];
+  TEST_ASSERT(tail != NULL, "Last config should be accessible");
+
+  // Report how much was loaded and allocated.
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Memory allocated for %d configurations (%zu bytes total)",
+              ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig));
+
+  // A lookup against the large set must still succeed.
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_rows[NCCL_NUM_ALGORITHMS];
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    cost_rows[a] = costs[a];
+  }
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+    for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) {
+      costs[a][p] = 1.0; // Default high cost
+    }
+  }
+
+  int channels;
+  rc = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                         cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                         0, &channels);
+  TEST_ASSERT(rc == ncclSuccess, "GetCollInfo should work with large config set");
+
+  // Tear down: context, temporary file, environment.
+  pluginDestroy(context);
+  unlink(kConfigPath);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+
+  TEST_PASS();
+}
+
+// Test 12: Very large configuration stress test
+//
+// Writes a config file containing 2000 allreduce entries (20x the old static
+// limit), loads it through pluginInit(), and checks that every entry was
+// parsed. Every 100th entry is then spot-checked for the expected collective
+// type and minBytes value. The temporary file and the NCCL_TUNER_CONFIG_FILE
+// environment variable are removed before the test reports success.
+int test_very_large_config_stress() {
+  const char* stress_config_file = "test_stress.conf";
+
+  // Create an even larger configuration file to stress test the implementation
+  FILE* f = fopen(stress_config_file, "w");
+  TEST_ASSERT(f != NULL, "Should be able to create stress test config file");
+
+  fprintf(f, "# Stress test configuration with very large number of entries\n");
+
+  // Generate an extremely large number of configurations
+  const int stress_configs = 2000; // 20x the old static limit
+
+  for (int i = 0; i < stress_configs; i++) {
+    // Create varied but valid configurations: entry i spans the byte range
+    // [i * 512, i * 512 + 1024], so minBytes identifies the entry uniquely
+    fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n",
+            i * 512, (i * 512) + 1024);
+  }
+
+  fclose(f);
+
+  setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1);
+
+  // Test initialization with stress config
+  void* context = NULL;
+  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
+  // Guard the cast below so that a missing context is caught by an assertion
+  // rather than by dereferencing a NULL pointer
+  TEST_ASSERT(context != NULL, "Context should be allocated");
+
+  TunerContext* ctx = (TunerContext*)context;
+  TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations");
+  TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated");
+
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Stress test - loaded %d configurations successfully", stress_configs);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Memory usage: %zu bytes for configuration array",
+              stress_configs * sizeof(TuningConfig));
+
+  // Verify we can access configurations throughout the entire range.
+  // Indexing up to stress_configs is safe because numConfigs was asserted
+  // equal to stress_configs above.
+  for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config
+    TuningConfig* config = &ctx->configs[i];
+    TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type");
+    TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes");
+  }
+
+  // Clean up
+  pluginDestroy(context);
+  unlink(stress_config_file);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+
+  TEST_PASS();
+}
+
+// Test 13: Edge case - empty config file
+//
+// Initializes the plugin from a config file that contains only comments and
+// blank lines. Checks that initialization succeeds with zero configurations
+// and no configuration array, and that pluginGetCollInfo() still returns
+// ncclSuccess when there is nothing to match against.
+int test_empty_config() {
+  const char* empty_config_file = "test_empty.conf";
+
+  // Create empty config file (only comments)
+  create_test_config(empty_config_file,
+    "# Empty configuration file\n"
+    "# No actual configurations\n"
+    "\n"
+    "\n");
+
+  setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
+
+  void* context = NULL;
+  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
+  // Guard the cast below so that a missing context is caught by an assertion
+  // rather than by dereferencing a NULL pointer
+  TEST_ASSERT(context != NULL, "Context should be allocated");
+
+  TunerContext* ctx = (TunerContext*)context;
+  TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations");
+  TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations");
+  TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config");
+
+  // Test that plugin still works with no configurations (fallback behavior).
+  // The cost table is filled with 1.0 and handed over as an array of row
+  // pointers.
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  int nChannels;
+  result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                            cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                            0, &nChannels);
+  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
+
+  // Clean up
+  pluginDestroy(context);
+  unlink(empty_config_file);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+
+  TEST_PASS();
+}
+
+// Test runner function pointer type.
+// A test takes no arguments and returns an int; main() adds that return value
+// to its pass counter, so tests are presumably expected to return 1 on pass
+// and 0 on failure (confirm against TEST_PASS / TEST_ASSERT).
+typedef int (*TestFunction)(void);
+
+// Test registry entry: associates a command-line test name with the function
+// that implements it and a short description shown in the help output.
+typedef struct {
+  const char* name;
+  TestFunction func;
+  const char* description;
+} TestCase;
+
+// Registry of every runnable test, in the order they execute when no test
+// name is given on the command line. The list is terminated by an entry whose
+// fields are all NULL; consumers stop at the first NULL name.
+TestCase test_cases[] = {
+  { .name = "init",           .func = test_plugin_init,
+    .description = "Plugin initialization" },
+  { .name = "config-valid",   .func = test_config_parsing_valid,
+    .description = "Valid configuration parsing" },
+  { .name = "config-invalid", .func = test_config_parsing_invalid,
+    .description = "Invalid configuration parsing" },
+  { .name = "collective",     .func = test_collective_matching,
+    .description = "Collective type matching" },
+  { .name = "size",           .func = test_size_matching,
+    .description = "Size range matching" },
+  { .name = "topology",       .func = test_topology_matching,
+    .description = "Topology matching" },
+  { .name = "channels",       .func = test_default_channels,
+    .description = "Default channels behavior" },
+  { .name = "regbuff",        .func = test_regbuff_matching,
+    .description = "Registered buffer matching" },
+  { .name = "pipeops",        .func = test_pipeops_matching,
+    .description = "Pipeline operations matching" },
+  { .name = "fallback",       .func = test_no_match_fallback,
+    .description = "Fallback behavior" },
+  { .name = "large-config",   .func = test_large_config,
+    .description = "Large configuration files (dynamic allocation)" },
+  { .name = "stress-config",  .func = test_very_large_config_stress,
+    .description = "Very large configuration stress test" },
+  { .name = "empty-config",   .func = test_empty_config,
+    .description = "Empty configuration file handling" },
+  { .name = NULL, .func = NULL, .description = NULL }  // Terminating sentinel
+};
+
+// Print usage text to stdout: the invocation synopsis, one line per
+// registered test (name and description), and a few example command lines.
+// program_name is substituted into the synopsis and the examples.
+void show_help(const char* program_name) {
+  printf("Usage: %s [test_name ...]\n\n", program_name);
+
+  fputs("Available tests:\n", stdout);
+  // Walk the registry until the entry with a NULL name is reached
+  for (const TestCase* entry = test_cases; entry->name != NULL; ++entry) {
+    printf("  %-15s - %s\n", entry->name, entry->description);
+  }
+
+  fputs("\nExamples:\n", stdout);
+  printf("  %s                    # Run all tests\n", program_name);
+  printf("  %s init               # Run only initialization test\n", program_name);
+  printf("  %s init collective    # Run initialization and collective tests\n", program_name);
+  printf("  %s --help             # Show this help\n", program_name);
+}
+
+// Look up a test function by its registered name.
+// Returns the function of the first registry entry whose name equals `name`
+// exactly (strcmp), or NULL when no entry matches.
+TestFunction find_test(const char* name) {
+  const TestCase* entry = test_cases;
+
+  // Advance until either the name matches or the NULL-name terminator is hit
+  while (entry->name != NULL && strcmp(entry->name, name) != 0) {
+    ++entry;
+  }
+
+  return (entry->name != NULL) ? entry->func : NULL;
+}
+
+// Main test runner
+//
+// With no arguments every registered test runs; otherwise each argument is
+// looked up by name and the matching tests run in the order given. "--help"
+// or "-h" as the first argument prints usage and exits with 0. An unknown
+// test name stops the run immediately with exit code 1, before any summary is
+// printed. Otherwise a summary is printed and the exit code is 0 when the
+// pass count equals the number of tests run, 1 if not.
+int main(int argc, char* argv[]) {
+  int total = 0;
+  int passed = 0;
+
+  // Help is only recognised in the first argument position
+  if (argc > 1) {
+    const char* first_arg = argv[1];
+    if (!strcmp(first_arg, "--help") || !strcmp(first_arg, "-h")) {
+      show_help(argv[0]);
+      return 0;
+    }
+  }
+
+  fputs("Running NCCL Tuner Plugin Unit Tests\n", stdout);
+  fputs("=====================================\n", stdout);
+
+  if (argc != 1) {
+    // Run only the tests named on the command line
+    for (int arg = 1; arg < argc; arg++) {
+      const char* requested = argv[arg];
+      TestFunction selected = find_test(requested);
+      if (selected == NULL) {
+        printf("ERROR: Unknown test '%s'\n", requested);
+        printf("Use --help to see available tests\n");
+        return 1;
+      }
+      total += 1;
+      passed += selected();
+    }
+  } else {
+    // No arguments - run the whole registry in declaration order
+    for (const TestCase* entry = test_cases; entry->name != NULL; ++entry) {
+      total += 1;
+      passed += entry->func();
+    }
+  }
+
+  fputs("\n=====================================\n", stdout);
+  printf("Test Results: %d/%d tests passed\n", passed, total);
+
+  const int all_passed = (passed == total);
+  fputs(all_passed ? "All tests PASSED!\n" : "Some tests FAILED!\n", stdout);
+  return all_passed ? 0 : 1;
+}