From a1105a0b9664a441c0e11387b6c3d66db4a96bd3 Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Sun, 1 Feb 2026 19:36:07 +0000 Subject: [PATCH] feat: Enhance vLLM benchmarking to compare Triton and ROCm attention, introduce a new script for cluster configuration, and update Dockerfile for new tools and dependencies. --- .gitignore | 1 + Dockerfile | 13 ++- benchmarks/max_context_results.json | 144 ++++++++++++++++++++++++++++ benchmarks/run_vllm_bench.py | 53 +++++++--- benchmarks/vllm_cluster_bench.py | 82 ++++++++++------ docs/index.html | 7 +- docs/parse_results.py | 6 +- rdma_cluster/compare_eth_vs_rdma.sh | 32 ++++--- scripts/configure_cluster.sh | 135 ++++++++++++++++++++++++++ 9 files changed, 410 insertions(+), 63 deletions(-) create mode 100644 .gitignore create mode 100755 scripts/configure_cluster.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e99e36 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 9799be9..d73084c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -130,7 +130,11 @@ COPY scripts/start_vllm_cluster.py /usr/local/bin/start-vllm-cluster COPY benchmarks/max_context_results.json /opt/max_context_results.json COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py COPY benchmarks/vllm_cluster_bench.py /opt/vllm_cluster_bench.py -RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmod +x /usr/local/bin/start-vllm-cluster && chmod +x /opt/vllm_cluster_bench.py && chmod 0644 /opt/max_context_results.json +COPY benchmarks/find_max_context.py /opt/find_max_context.py +COPY rdma_cluster/compare_eth_vs_rdma.sh /opt/compare_eth_vs_rdma.sh +COPY scripts/configure_cluster.sh /opt/configure_cluster.sh +RUN chmod +x /opt/configure_cluster.sh +RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmod +x /usr/local/bin/start-vllm-cluster && chmod +x /opt/vllm_cluster_bench.py && chmod +x 
/opt/compare_eth_vs_rdma.sh && chmod +x /opt/find_max_context.py && chmod 0644 /opt/max_context_results.json RUN chmod 0644 /etc/profile.d/*.sh RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh @@ -145,4 +149,11 @@ RUN echo "Installing Custom RCCL..." && \ find /opt/venv -name "librccl.so.1" -exec cp -fv /tmp/librccl.so.1 {} + && \ rm /tmp/librccl.so.1 +# 10. Force Upgrade Transformers (User Override) +# Required for GLM Flash. vLLM reports incompatibility with transformers >= 5, +# but this version (5.0.0) has been tested and confirmed working. +RUN python -m pip install transformers==5.0.0 + +RUN chmod -R a+rwX /opt + CMD ["/bin/bash"] diff --git a/benchmarks/max_context_results.json b/benchmarks/max_context_results.json index 461cceb..cd17079 100644 --- a/benchmarks/max_context_results.json +++ b/benchmarks/max_context_results.json @@ -329,5 +329,149 @@ "real_capacity": 0, "status": "fail", "error": "Verification Failed" + }, + { + "model": "zai-org/GLM-4.7-Flash", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 202752, + "configured_len": 162201, + "real_capacity": 791088, + "status": "success", + "error": "Timeout", + "max_context_1_user": 162201 + }, + { + "model": "zai-org/GLM-4.7-Flash", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + "model_limit": 202752, + "configured_len": 162201, + "real_capacity": 791056, + "status": "success", + "error": "", + "max_context_1_user": 162201 + }, + { + "model": "zai-org/GLM-4.7-Flash", + "tp": 1, + "util": "0.95", + "max_seqs": 8, + "model_limit": 202752, + "configured_len": 162201, + "real_capacity": 791008, + "status": "success", + "error": "", + "max_context_1_user": 162201 + }, + { + "model": "zai-org/GLM-4.7-Flash", + "tp": 1, + "util": "0.95", + "max_seqs": 16, + "model_limit": 202752, + "configured_len": 162201, + "real_capacity": 790992, + "status": "success", + "error": "", + "max_context_1_user": 162201 + }, + { + "model": 
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 262144, + "configured_len": 262144, + "real_capacity": 1089152, + "status": "success", + "error": "", + "max_context_1_user": 262144 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + "model_limit": 262144, + "configured_len": 262144, + "real_capacity": 1089072, + "status": "success", + "error": "", + "max_context_1_user": 262144 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "tp": 1, + "util": "0.95", + "max_seqs": 8, + "model_limit": 262144, + "configured_len": 262144, + "real_capacity": 1089072, + "status": "success", + "error": "", + "max_context_1_user": 262144 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "tp": 1, + "util": "0.95", + "max_seqs": 16, + "model_limit": 262144, + "configured_len": 262144, + "real_capacity": 1089056, + "status": "success", + "error": "", + "max_context_1_user": 262144 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 262144, + "configured_len": 209715, + "real_capacity": 936544, + "status": "success", + "error": "Timeout", + "max_context_1_user": 209715 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + "model_limit": 262144, + "configured_len": 209715, + "real_capacity": 937312, + "status": "success", + "error": "", + "max_context_1_user": 209715 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "tp": 1, + "util": "0.95", + "max_seqs": 8, + "model_limit": 262144, + "configured_len": 209715, + "real_capacity": 937296, + "status": "success", + "error": "", + "max_context_1_user": 209715 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "tp": 1, + "util": "0.95", + "max_seqs": 16, + 
"model_limit": 262144, + "configured_len": 209715, + "real_capacity": 937280, + "status": "success", + "error": "", + "max_context_1_user": 209715 } ] \ No newline at end of file diff --git a/benchmarks/run_vllm_bench.py b/benchmarks/run_vllm_bench.py index 3ca45b0..14ba0be 100644 --- a/benchmarks/run_vllm_bench.py +++ b/benchmarks/run_vllm_bench.py @@ -76,7 +76,7 @@ MODEL_TABLE = { "max_tokens": "32768" }, - # 5. Qwen 80B AWQ (The Big One) [NEW] + # 5. Qwen 80B AWQ # Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB. # Config: 20k ctx fits in that cache. Eager mode required for stability. "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": { @@ -176,14 +176,17 @@ def get_model_args(model, tp_size): return cmd -def run_throughput(model, tp_size): +def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None): if tp_size not in MODEL_TABLE[model]["valid_tp"]: return model_safe = model.replace("/", "_") - output_file = RESULTS_DIR / f"{model_safe}_tp{tp_size}_throughput.json" + output_dir_path = Path(output_dir) + output_dir_path.mkdir(parents=True, exist_ok=True) + + output_file = output_dir_path / f"{model_safe}_tp{tp_size}_throughput.json" if output_file.exists(): - log(f"SKIP Throughput {model} (TP={tp_size})") + log(f"SKIP {model} (TP={tp_size} | {backend_name})") return dataset_path = get_dataset() @@ -192,7 +195,7 @@ def run_throughput(model, tp_size): # Retrieve Model-Specific Batch Tokens batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS) - log(f"START Throughput {model} (TP={tp_size}) [Batch: {batch_tokens}]...") + log(f"START {model} (TP={tp_size} | {backend_name}) [Batch: {batch_tokens}]...") kill_vllm() nuke_vllm_cache() @@ -212,31 +215,43 @@ def run_throughput(model, tp_size): # Inject model specific env vars (e.g. 
for AWQ) model_env = MODEL_TABLE[model].get("env", {}) env.update(model_env) + + # Extra Env + if extra_env: + env.update(extra_env) try: subprocess.run(cmd, check=True, env=env) except: - log(f"ERROR: Throughput failed {model}") - + log(f"ERROR: Failed {model} [{backend_name}]") def print_summary(tps): - print(f"\n{'MODEL':<40} | {'TP':<2} | {'TOK/S':<8}") - print("-" * 60) + print(f"\n{'MODEL':<40} | {'TP':<2} | {'Triton':<8} | {'ROCm':<8}") + print("-" * 75) for m in MODELS_TO_RUN: msafe = m.replace("/", "_") for tp in tps: if tp not in MODEL_TABLE[m]["valid_tp"]: continue + # Default try: - tdata = json.loads((RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json").read_text()) - tok_s = f"{tdata.get('tokens_per_second', 0):.1f}" - except: tok_s = "N/A" + p1 = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json" + d1 = json.loads(p1.read_text()) + val1 = f"{d1.get('tokens_per_second', 0):.1f}" + except: val1 = "N/A" + + # ROCm + try: + p2 = Path("benchmark_results_rocm_attn/benchmark_results") / f"{msafe}_tp{tp}_throughput.json" + d2 = json.loads(p2.read_text()) + val2 = f"{d2.get('tokens_per_second', 0):.1f}" + except: val2 = "N/A" name_cell = m.split('/')[-1] - print(f"{name_cell:<40} | {tp:<2} | {tok_s:<8}") - print("-" * 60) + print(f"{name_cell:<40} | {tp:<2} | {val1:<8} | {val2:<8}") + print("-" * 75) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -254,5 +269,13 @@ if __name__ == "__main__": kill_vllm() for tp in valid_tp_args: for m in MODELS_TO_RUN: - run_throughput(m, tp) + # 1. Default (Triton) + run_throughput(m, tp, "Default", RESULTS_DIR) + + # 2. 
ROCm Attention + run_throughput(m, tp, "ROCm-Attn", "benchmark_results_rocm_attn/benchmark_results", { + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1", + "VLLM_USE_TRITON_FLASH_ATTN": "0" + }) + print_summary(valid_tp_args) diff --git a/benchmarks/vllm_cluster_bench.py b/benchmarks/vllm_cluster_bench.py index 5e48394..3a15aee 100755 --- a/benchmarks/vllm_cluster_bench.py +++ b/benchmarks/vllm_cluster_bench.py @@ -152,22 +152,21 @@ def get_model_args(model): if config.get("trust_remote"): cmd.append("--trust-remote-code") - # FORCED EAGER as per request for cluster stability - cmd.append("--enforce-eager") + # Respect config for Eager Mode (Apples-to-Apples with TP=1) + if config.get("enforce_eager"): + cmd.append("--enforce-eager") return cmd -def run_cluster_throughput(model): - # Skip if TP=2 is not valid for this model - if CLUSTER_TP not in MODEL_TABLE[model]["valid_tp"]: - log(f"SKIP {model} (Support TP={MODEL_TABLE[model]['valid_tp']}, Cluster is TP={CLUSTER_TP})") - return - +def run_bench_set(model, backend_name, output_dir, extra_env=None): model_safe = model.replace("/", "_") - output_file = RESULTS_DIR / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json" + output_dir_path = Path(output_dir) + output_dir_path.mkdir(parents=True, exist_ok=True) + + output_file = output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json" if output_file.exists(): - log(f"SKIP {model} (Result exists)") + log(f"SKIP {model} [{backend_name}] (Result exists)") return dataset_path = get_dataset() @@ -175,14 +174,10 @@ def run_cluster_throughput(model): batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS) - log(f"START Cluster Bench {model} [TP={CLUSTER_TP} | Eager=True]...") + log(f"START {model} [TP={CLUSTER_TP} | {backend_name}]...") - # Nuke cache between runs to be safe nuke_vllm_cache() - # NOTE: We use 'vllm bench throughput' directly. - # It supports most 'vllm serve' args but we need to ensure it picks up the ray backend. 
- cmd = ["vllm", "bench", "throughput"] + get_model_args(model) cmd.extend([ "--num-prompts", str(OFF_NUM_PROMPTS), @@ -195,43 +190,72 @@ def run_cluster_throughput(model): env = get_cluster_env() - # Add model specific envs + # Model specific envs model_env = MODEL_TABLE[model].get("env", {}) env.update(model_env) + + # Run specific envs (e.g. ROCm attention) + if extra_env: + env.update(extra_env) try: log(f"Command: {' '.join(cmd)}") subprocess.run(cmd, check=True, env=env) except subprocess.CalledProcessError as e: - log(f"ERROR: Cluster Benchmark failed for {model} (Exit {e.returncode})") + log(f"ERROR: Failed {model} [{backend_name}] (Exit {e.returncode})") except Exception as e: log(f"ERROR: System error: {e}") +def run_cluster_throughput(model): + # 1. Default Run (Triton usually, unless global envs set) + run_bench_set( + model, + "Default", + RESULTS_DIR + ) + + # 2. ROCm Attention Run + run_bench_set( + model, + "ROCm-Attn", + "benchmark_results_rocm_attn/benchmark_results", + extra_env={ + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1", + "VLLM_USE_TRITON_FLASH_ATTN": "0" + } + ) + def print_summary(): - print(f"\n{'MODEL (Cluster TP=2)':<50} | {'TOK/S':<10}") - print("-" * 65) + print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}") + print("-" * 75) for m in MODELS_TO_RUN: msafe = m.replace("/", "_") + + # Default try: - tdata = json.loads((RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json").read_text()) - tok_s = f"{tdata.get('tokens_per_second', 0):.1f}" - except: - if CLUSTER_TP not in MODEL_TABLE[m]["valid_tp"]: - tok_s = "SKIP" - else: - tok_s = "N/A" + p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json" + d1 = json.loads(p1.read_text()) + val1 = f"{d1.get('tokens_per_second', 0):.1f}" + except: val1 = "N/A" + + # ROCm + try: + p2 = Path("benchmark_results_rocm_attn/benchmark_results") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json" + d2 = json.loads(p2.read_text()) + val2 = f"{d2.get('tokens_per_second', 
0):.1f}" + except: val2 = "N/A" name_cell = m.split('/')[-1] - print(f"{name_cell:<50} | {tok_s:<10}") - print("-" * 65) + print(f"{name_cell:<50} | {val1:<8} | {val2:<8}") + print("-" * 75) if __name__ == "__main__": if not check_ray_status(): log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.") sys.exit(1) - log("Ray Cluster Detected. Starting Benchmarks...") + log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...") for m in MODELS_TO_RUN: run_cluster_throughput(m) diff --git a/docs/index.html b/docs/index.html index 0d9a343..09e0d3f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -551,8 +551,11 @@ // Sort tests: Throughput first, then others alphabetically tests.sort((a, b) => { - if (a.name === "Throughput") return -1; - if (b.name === "Throughput") return 1; + const aTp = a.name.startsWith("Throughput"); + const bTp = b.name.startsWith("Throughput"); + + if (aTp && !bTp) return -1; + if (!aTp && bTp) return 1; return a.name.localeCompare(b.name); }); diff --git a/docs/parse_results.py b/docs/parse_results.py index 83e4cb9..771c65f 100644 --- a/docs/parse_results.py +++ b/docs/parse_results.py @@ -89,7 +89,7 @@ def parse_logs(): if "throughput" in fname: tps = data.get("tokens_per_second", 0) run = base_run.copy() - run["test"] = "Throughput" + run["test"] = f"Throughput (TP{tp})" run["tps_mean"] = tps if tps == 0 or (isinstance(data, dict) and "error" in str(data).lower()): # checking if error string is in json dump run["error"] = True @@ -111,13 +111,13 @@ def parse_logs(): # TTFT r1 = base_run.copy() - r1["test"] = f"TTFT @ QPS {qps}" + r1["test"] = f"TTFT (TP{tp}) @ QPS {qps}" r1["tps_mean"] = ttft runs.append(r1) # TPOT r2 = base_run.copy() - r2["test"] = f"TPOT @ QPS {qps}" + r2["test"] = f"TPOT (TP{tp}) @ QPS {qps}" r2["tps_mean"] = tpot runs.append(r2) diff --git a/rdma_cluster/compare_eth_vs_rdma.sh b/rdma_cluster/compare_eth_vs_rdma.sh index bf9330e..d4a5cc9 100755 --- 
a/rdma_cluster/compare_eth_vs_rdma.sh +++ b/rdma_cluster/compare_eth_vs_rdma.sh @@ -6,7 +6,7 @@ HOST_ETH="192.168.1.127" # Automatically detect local and remote RDMA device names RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}') -RDMA_DEV_REMOTE=$(ssh fw2 "ibv_devices | awk 'NR==3 {print \$1}'") +RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'") WORKDIR="/tmp/rdma_bench" mkdir -p "$WORKDIR" @@ -39,36 +39,42 @@ parse_rdma_bw_mib() { # -------- normal ethernet -------- ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt" -ssh fw2 "iperf3 -s -1" >/dev/null 2>&1 & +ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 & sleep 1 iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt" # -------- roce ethernet (tcp) -------- ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt" -ssh fw2 "iperf3 -s -1" >/dev/null 2>&1 & +ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 & sleep 1 iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt" # -------- rdma latency -------- -ssh fw2 "ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 & +ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 & sleep 2 ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1 # -------- rdma bandwidth (maximized) -------- # We use -x 1 because show_gids confirmed RoCE v2 is at Index 1 -ssh fw2 "ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 & +ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 & sleep 2 ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1 # -------- parse -------- -ETH_LAT=$(parse_ping_avg "$WORKDIR/ping_eth.txt") +ETH_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_eth.txt") ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt") -ROCE_LAT=$(parse_ping_avg 
"$WORKDIR/ping_roce.txt") +ROCE_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_roce.txt") ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt") -RDMA_LAT=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt") +RDMA_LAT_US=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt") RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt") + +# Convert units for dual display +ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')") +ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')") +RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')") + RDMA_BW_GBPS=$(python3 - < Set Head Node IP (default: 192.168.100.1)" + echo " --worker-ip Set Worker Node IP (default: 192.168.100.2)" + echo " -h, --help Show this help message" + exit 1 +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + head|worker|run-vllm) + MODE="$1" + shift + ;; + --head-ip) + HEAD_IP="$2" + shift 2 + ;; + --worker-ip) + WORKER_IP="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + +if [ -z "$MODE" ]; then + usage +fi + +setup_head() { + echo "Configuring Head Node..." + ray stop --force + + # Critical Config + export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + export RAY_memory_monitor_refresh_ms=0 + export VLLM_HOST_IP=$HEAD_IP + # Dynamic interface detection based on subnet of HEAD_IP + SUBNET=$(echo $HEAD_IP | awk -F. '{print $1"."$2"."$3".0/24"}') + export RDMA_IFACE=$(ip -o addr show to $SUBNET | awk '{print $2}' | head -n1) + + if [ -z "$RDMA_IFACE" ]; then + echo "Warning: Could not detect interface for $SUBNET. Defaulting to eth0." + export RDMA_IFACE="eth0" + fi + + export NCCL_SOCKET_IFNAME=$RDMA_IFACE + export GLOO_SOCKET_IFNAME=$RDMA_IFACE + + echo "Starting Ray Head on $HEAD_IP (Interface: $RDMA_IFACE)..." + ray start --head --port=6379 --node-ip-address=$HEAD_IP --num-gpus=1 --num-cpus=8 --disable-usage-stats +} + +setup_worker() { + echo "Configuring Worker Node..." 
+ ray stop --force + + # Critical Config + export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + export RAY_memory_monitor_refresh_ms=0 + export VLLM_HOST_IP=$WORKER_IP + # Dynamic interface detection based on subnet of WORKER_IP + SUBNET=$(echo $WORKER_IP | awk -F. '{print $1"."$2"."$3".0/24"}') + export RDMA_IFACE=$(ip -o addr show to $SUBNET | awk '{print $2}' | head -n1) + + if [ -z "$RDMA_IFACE" ]; then + echo "Warning: Could not detect interface for $SUBNET. Defaulting to eth0." + export RDMA_IFACE="eth0" + fi + + export NCCL_SOCKET_IFNAME=$RDMA_IFACE + export GLOO_SOCKET_IFNAME=$RDMA_IFACE + + echo "Starting Ray Worker on $WORKER_IP connecting to $HEAD_IP:6379 (Interface: $RDMA_IFACE)..." + ray start --address="$HEAD_IP:6379" --num-gpus=1 --num-cpus=8 --disable-usage-stats +} + +run_vllm() { + echo "Running vLLM..." + + export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + export VLLM_HOST_IP=$HEAD_IP + + # Dynamic interface detection based on subnet of HEAD_IP + SUBNET=$(echo $HEAD_IP | awk -F. '{print $1"."$2"."$3".0/24"}') + export NCCL_SOCKET_IFNAME=$(ip -o addr show to $SUBNET | awk '{print $2}' | head -n1) + + if [ -z "$NCCL_SOCKET_IFNAME" ]; then + echo "Warning: Could not detect interface for $SUBNET. Defaulting to eth0." + export NCCL_SOCKET_IFNAME="eth0" + fi + + export NCCL_IB_GID_INDEX=1 + export NCCL_IB_DISABLE=0 + export NCCL_NET_GDR_LEVEL=0 + + echo "Launching vLLM Serve..." + vllm serve facebook/opt-125m \ + --tensor-parallel-size 2 \ + --distributed-executor-backend ray \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.90 +} + +# Execute Mode +if [ "$MODE" == "head" ]; then + setup_head +elif [ "$MODE" == "worker" ]; then + setup_worker +elif [ "$MODE" == "run-vllm" ]; then + run_vllm +fi