updated benchmarks including thunderbolt and configuratuion guides

This commit is contained in:
Donato Capitella
2026-02-25 10:48:42 +00:00
parent 6875f62ccf
commit b035bcb482
26 changed files with 820 additions and 83 deletions
+134 -43
View File
@@ -3,62 +3,140 @@
# -------- dynamic config --------
HOST_ROCE="192.168.100.2"
HOST_ETH="192.168.1.127"
HOST_TB="192.168.2.2"
# Automatically detect local and remote RDMA device names
RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
# Parse args
RUN_ETH=true
RUN_ROCE=true
RUN_TB=true
RUN_RDMA=true
# If any flags are provided, turn off defaults and only run requested
if [ "$#" -gt 0 ]; then
RUN_ETH=false
RUN_ROCE=false
RUN_TB=false
RUN_RDMA=false
fi
while getopts "ertih" opt; do
case ${opt} in
e ) RUN_ETH=true ;;
r ) RUN_ROCE=true ;;
t ) RUN_TB=true ;;
i ) RUN_RDMA=true ;;
h ) echo "Usage: $0 [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)]"
echo
echo "Options:"
echo " -e Run benchmarking for standard Ethernet (1G LAN)."
echo " -r Run benchmarking for RoCE NIC (via Ethernet/TCP)."
echo " -t Run benchmarking for Thunderbolt link."
echo " -i Run benchmarking for RDMA (RoCE v2)."
echo " -h Print this help message and exit."
echo
echo "If no arguments are provided, all benchmarks are executed."
exit 0
;;
\? ) echo "Usage: cmd [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)] [-h (Help)]"
exit 1
;;
esac
done
# Automatically detect local and remote RDMA device names if needed
if [ "$RUN_RDMA" = true ]; then
RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
fi
WORKDIR="/tmp/rdma_bench"
mkdir -p "$WORKDIR"
# -------- helpers --------
parse_ping_avg() {
grep rtt "$1" | awk -F'/' '{print $5}'
if [ -f "$1" ]; then
grep rtt "$1" | awk -F'/' '{print $5}'
else
echo "0"
fi
}
parse_iperf_gbps() {
grep receiver "$1" | tail -n1 | awk '
{
val=$(NF-2);
unit=$(NF-1);
if (unit=="Mbits/sec") printf "%.2f", val/1000;
else if (unit=="Gbits/sec") printf "%.2f", val;
else print "N/A";
}'
if [ -f "$1" ]; then
grep receiver "$1" | tail -n1 | awk '
{
val=$(NF-2);
unit=$(NF-1);
if (unit=="Mbits/sec") printf "%.2f", val/1000;
else if (unit=="Gbits/sec") printf "%.2f", val;
else print "0.00";
}'
else
echo "0.00"
fi
}
parse_rdma_lat_us() {
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
echo "${val:-0}"
if [ -f "$1" ]; then
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
echo "${val:-0}"
else
echo "0"
fi
}
parse_rdma_bw_mib() {
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
echo "${val:-0}"
if [ -f "$1" ]; then
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
echo "${val:-0}"
else
echo "0"
fi
}
# -------- normal ethernet --------
ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
# Clear old results
rm -f "$WORKDIR"/*.txt
# -------- roce ethernet (tcp) --------
ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
if [ "$RUN_ETH" = true ]; then
# -------- normal ethernet --------
echo "[*] Benchmarking Ethernet (1G LAN)..."
ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
fi
# -------- rdma latency --------
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
sleep 2
ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
if [ "$RUN_ROCE" = true ]; then
# -------- roce ethernet (tcp) --------
echo "[*] Benchmarking RoCE NIC (Ethernet/TCP)..."
ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
fi
# -------- rdma bandwidth (maximized) --------
# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
sleep 2
ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
if [ "$RUN_TB" = true ]; then
# -------- thunderbolt ethernet (tcp) --------
echo "[*] Benchmarking Thunderbolt..."
ping -c 10 "$HOST_TB" > "$WORKDIR/ping_tb.txt"
ssh "$HOST_TB" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_TB" -P 8 -t 10 > "$WORKDIR/iperf_tb.txt"
fi
if [ "$RUN_RDMA" = true ]; then
# -------- rdma latency --------
echo "[*] Benchmarking RDMA (RoCE v2)..."
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
sleep 2
ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
# -------- rdma bandwidth (maximized) --------
# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
sleep 2
ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
fi
# -------- parse --------
ETH_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_eth.txt")
@@ -67,13 +145,17 @@ ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt")
ROCE_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_roce.txt")
ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt")
TB_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_tb.txt")
TB_BW=$(parse_iperf_gbps "$WORKDIR/iperf_tb.txt")
RDMA_LAT_US=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt")
RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt")
# Convert units for dual display
ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')")
ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')")
RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')")
ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
TB_LAT_US=$(python3 -c "print(f'{float(${TB_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')" 2>/dev/null || echo "0.00")
RDMA_BW_GBPS=$(python3 - <<EOF
import sys
@@ -88,9 +170,18 @@ EOF
echo
echo "=== Network Comparison ==="
echo
printf "%-20s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
echo "----------------------------------------------------------------"
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS} ms" "${ETH_LAT_US} us" "${ETH_BW} Gbps"
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS} ms" "${ROCE_LAT_US} us" "${ROCE_BW} Gbps"
printf "%-20s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS} ms" "${RDMA_LAT_US} us" "${RDMA_BW_GBPS} Gbps"
printf "%-25s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
echo "-----------------------------------------------------------------------"
if [ "$RUN_ETH" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS:-0.00} ms" "${ETH_LAT_US:-0.00} us" "${ETH_BW:-0.00} Gbps"
fi
if [ "$RUN_ROCE" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS:-0.00} ms" "${ROCE_LAT_US:-0.00} us" "${ROCE_BW:-0.00} Gbps"
fi
if [ "$RUN_TB" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (Thunderbolt)" "${TB_LAT_MS:-0.00} ms" "${TB_LAT_US:-0.00} us" "${TB_BW:-0.00} Gbps"
fi
if [ "$RUN_RDMA" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS:-0.00} ms" "${RDMA_LAT_US:-0.00} us" "${RDMA_BW_GBPS:-0.00} Gbps"
fi
echo
+58
View File
@@ -333,3 +333,61 @@ If you see link issues, ensure your Intel E810 firmware is up to date using the
* **Reddit - Strix Halo Batching with Tensor Parallel**: [Thread by Hungry_Elk_3276](https://www.reddit.com/r/LocalLLaMA/comments/1p8nped/strix_halo_batching_with_tensor_parallel_and/)
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1151` support in upstream RCCL.
---
## 9. Alternative: Thunderbolt Networking
If you do not have dedicated 100GbE RDMA network cards, you can directly connect the two nodes using a high-quality **Thunderbolt 4 / USB4 cable**. This will create a `thunderbolt0` network interface.
While it lacks the ultra-low microprocessor-level latency of RDMA, it provides significantly more bandwidth than standard 1GbE/5GbE Ethernet and is easier to configure.
>**Note**: `thunderbolt-net` relies on standard OS kernel TCP/IP stacks.
### 9.1 Thunderbolt Configuration
**1. Establish Connection:**
Connect the nodes directly using a certified Thunderbolt 4 or USB4 cable. Verify the link is active:
```bash
ip link show thunderbolt0
```
**2. Network Configuration (Head - Node 1):**
Configure a persistent connection using `nmcli` with a static IP and Jumbo Frames (reduces CPU overhead).
*Note: Jumbo Frames may be unsupported on some Thunderbolt host controllers.*
```bash
sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.1/24 mtu 9000
sudo nmcli connection up thunderbolt0
```
**3. Network Configuration (Worker - Node 2):**
```bash
sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.2/24 mtu 9000
sudo nmcli connection up thunderbolt0
```
**4. Firewall Rules:**
To ensure Ray and NCCL can communicate freely over this link:
```bash
# Assign the interface to the trusted zone permanently
sudo firewall-cmd --permanent --zone=trusted --add-interface=thunderbolt0
sudo firewall-cmd --reload
```
### 9.2 Running vLLM over Thunderbolt
Our cluster scripts dynamically detect the network interface based on the provided IPs. There is no need to manually export environment variables!
1. Open the Toolbox: `toolbox enter vllm`
2. Launch the cluster manager: `start-vllm-cluster`
3. Select **Option 1 (Configure IPs)**.
4. Set the **Head IP** explicitly to `192.168.2.1` and the **Worker IP** to `192.168.2.2`.
5. Start the cluster normally (Option 2). The script will automatically discover and utilize `thunderbolt0` as the backend network for Ray orchestration and GPU synchronization.
### 9.3 Validating the Link
I have added Thunderbolt support to the `compare_eth_vs_rdma.sh` script. Run it from inside the toolbox to see the latency and bandwidth of your Thunderbolt link compared to your other network interfaces.
You can use the `-t` flag to ONLY benchmark the Thunderbolt connection (or `-e`, `-r`, `-i` for the others):
```bash
/opt/compare_eth_vs_rdma.sh -t
```