fix: improve RDMA stability by configuring NCCL IB timeout and retry count.

此提交包含在:
Donato Capitella
2026-02-01 22:04:34 +00:00
父節點 b458b287d0
當前提交 128ddade14
共有 2 個檔案被更改,包括 10 行新增0 行删除
+4
查看文件
@@ -181,6 +181,10 @@ def get_cluster_env():
env["NCCL_IB_DISABLE"] = "0"
env["NCCL_NET_GDR_LEVEL"] = "0"
# Stability for RDMA (Fix for high-throughput models like Gemma 3)
env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s)
env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks
return env
def get_model_args(model):
+6
查看文件
@@ -148,6 +148,9 @@ def setup_worker_node(worker_ip, head_ip):
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..."
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
"""
@@ -183,6 +186,9 @@ def setup_head_node(head_ip):
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Head on {head_ip}..."
ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats
"""