From 128ddade14f4ad9ded59d40069093679a910a748 Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Sun, 1 Feb 2026 22:04:34 +0000 Subject: [PATCH] fix: improve RDMA stability by configuring NCCL IB timeout and retry count. --- benchmarks/vllm_cluster_bench.py | 4 ++++ scripts/start_vllm_cluster.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/benchmarks/vllm_cluster_bench.py b/benchmarks/vllm_cluster_bench.py index 26c2f62..56e4db7 100755 --- a/benchmarks/vllm_cluster_bench.py +++ b/benchmarks/vllm_cluster_bench.py @@ -181,6 +181,10 @@ def get_cluster_env(): env["NCCL_IB_DISABLE"] = "0" env["NCCL_NET_GDR_LEVEL"] = "0" + # Stability for RDMA (Fix for high-throughput models like Gemma 3) + env["NCCL_IB_TIMEOUT"] = "23" # ~34 seconds, i.e. 4.096 us * 2^23 (default is 18/~1s) + env["NCCL_IB_RETRY_CNT"] = "7" # 7 is the IB maximum and NCCL's documented default; pinned explicitly for lossy networks + return env def get_model_args(model): diff --git a/scripts/start_vllm_cluster.py b/scripts/start_vllm_cluster.py index 9fcfb7f..d6bb561 100755 --- a/scripts/start_vllm_cluster.py +++ b/scripts/start_vllm_cluster.py @@ -148,6 +148,9 @@ def setup_worker_node(worker_ip, head_ip): export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1) export NCCL_SOCKET_IFNAME=$RDMA_IFACE export GLOO_SOCKET_IFNAME=$RDMA_IFACE + # Stability for RDMA + export NCCL_IB_TIMEOUT=23 + export NCCL_IB_RETRY_CNT=7 echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..." ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats """ @@ -183,6 +186,9 @@ def setup_head_node(head_ip): export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1) export NCCL_SOCKET_IFNAME=$RDMA_IFACE export GLOO_SOCKET_IFNAME=$RDMA_IFACE + # Stability for RDMA + export NCCL_IB_TIMEOUT=23 + export NCCL_IB_RETRY_CNT=7 echo "Starting Ray Head on {head_ip}..." ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats """