feat: Enhance vLLM benchmarking to compare Triton and ROCm attention, introduce a new script for cluster configuration, and update Dockerfile for new tools and dependencies.
This commit is contained in:
@@ -0,0 +1 @@
|
||||
*.pyc
|
||||
+12
-1
@@ -130,7 +130,11 @@ COPY scripts/start_vllm_cluster.py /usr/local/bin/start-vllm-cluster
|
||||
COPY benchmarks/max_context_results.json /opt/max_context_results.json
|
||||
COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py
|
||||
COPY benchmarks/vllm_cluster_bench.py /opt/vllm_cluster_bench.py
|
||||
RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmod +x /usr/local/bin/start-vllm-cluster && chmod +x /opt/vllm_cluster_bench.py && chmod 0644 /opt/max_context_results.json
|
||||
COPY benchmarks/find_max_context.py /opt/find_max_context.py
|
||||
COPY rdma_cluster/compare_eth_vs_rdma.sh /opt/compare_eth_vs_rdma.sh
|
||||
COPY scripts/configure_cluster.sh /opt/configure_cluster.sh
|
||||
RUN chmod +x /opt/configure_cluster.sh
|
||||
RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmod +x /usr/local/bin/start-vllm-cluster && chmod +x /opt/vllm_cluster_bench.py && chmod +x /opt/compare_eth_vs_rdma.sh && chmod +x /opt/find_max_context.py && chmod 0644 /opt/max_context_results.json
|
||||
RUN chmod 0644 /etc/profile.d/*.sh
|
||||
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
|
||||
|
||||
@@ -145,4 +149,11 @@ RUN echo "Installing Custom RCCL..." && \
|
||||
find /opt/venv -name "librccl.so.1" -exec cp -fv /tmp/librccl.so.1 {} + && \
|
||||
rm /tmp/librccl.so.1
|
||||
|
||||
# 10. Force Upgrade Transformers (User Override)
|
||||
# Required for GLM Flash. vLLM reports incompatibility with transformers >= 5,
|
||||
# but this version (5.0.0) has been tested and confirmed working.
|
||||
RUN python -m pip install transformers==5.0.0
|
||||
|
||||
RUN chmod -R a+rwX /opt
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -329,5 +329,149 @@
|
||||
"real_capacity": 0,
|
||||
"status": "fail",
|
||||
"error": "Verification Failed"
|
||||
},
|
||||
{
|
||||
"model": "zai-org/GLM-4.7-Flash",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 1,
|
||||
"model_limit": 202752,
|
||||
"configured_len": 162201,
|
||||
"real_capacity": 791088,
|
||||
"status": "success",
|
||||
"error": "Timeout",
|
||||
"max_context_1_user": 162201
|
||||
},
|
||||
{
|
||||
"model": "zai-org/GLM-4.7-Flash",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 4,
|
||||
"model_limit": 202752,
|
||||
"configured_len": 162201,
|
||||
"real_capacity": 791056,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 162201
|
||||
},
|
||||
{
|
||||
"model": "zai-org/GLM-4.7-Flash",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 8,
|
||||
"model_limit": 202752,
|
||||
"configured_len": 162201,
|
||||
"real_capacity": 791008,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 162201
|
||||
},
|
||||
{
|
||||
"model": "zai-org/GLM-4.7-Flash",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 16,
|
||||
"model_limit": 202752,
|
||||
"configured_len": 162201,
|
||||
"real_capacity": 790992,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 162201
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 1,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 262144,
|
||||
"real_capacity": 1089152,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 262144
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 4,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 262144,
|
||||
"real_capacity": 1089072,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 262144
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 8,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 262144,
|
||||
"real_capacity": 1089072,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 262144
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 16,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 262144,
|
||||
"real_capacity": 1089056,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 262144
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 1,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 209715,
|
||||
"real_capacity": 936544,
|
||||
"status": "success",
|
||||
"error": "Timeout",
|
||||
"max_context_1_user": 209715
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 4,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 209715,
|
||||
"real_capacity": 937312,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 209715
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 8,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 209715,
|
||||
"real_capacity": 937296,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 209715
|
||||
},
|
||||
{
|
||||
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
|
||||
"tp": 1,
|
||||
"util": "0.95",
|
||||
"max_seqs": 16,
|
||||
"model_limit": 262144,
|
||||
"configured_len": 209715,
|
||||
"real_capacity": 937280,
|
||||
"status": "success",
|
||||
"error": "",
|
||||
"max_context_1_user": 209715
|
||||
}
|
||||
]
|
||||
@@ -76,7 +76,7 @@ MODEL_TABLE = {
|
||||
"max_tokens": "32768"
|
||||
},
|
||||
|
||||
# 5. Qwen 80B AWQ (The Big One) [NEW]
|
||||
# 5. Qwen 80B AWQ
|
||||
# Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
|
||||
# Config: 20k ctx fits in that cache. Eager mode required for stability.
|
||||
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
|
||||
@@ -176,14 +176,17 @@ def get_model_args(model, tp_size):
|
||||
|
||||
return cmd
|
||||
|
||||
def run_throughput(model, tp_size):
|
||||
def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None):
|
||||
if tp_size not in MODEL_TABLE[model]["valid_tp"]: return
|
||||
|
||||
model_safe = model.replace("/", "_")
|
||||
output_file = RESULTS_DIR / f"{model_safe}_tp{tp_size}_throughput.json"
|
||||
output_dir_path = Path(output_dir)
|
||||
output_dir_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
output_file = output_dir_path / f"{model_safe}_tp{tp_size}_throughput.json"
|
||||
|
||||
if output_file.exists():
|
||||
log(f"SKIP Throughput {model} (TP={tp_size})")
|
||||
log(f"SKIP {model} (TP={tp_size} | {backend_name})")
|
||||
return
|
||||
|
||||
dataset_path = get_dataset()
|
||||
@@ -192,7 +195,7 @@ def run_throughput(model, tp_size):
|
||||
# Retrieve Model-Specific Batch Tokens
|
||||
batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
|
||||
|
||||
log(f"START Throughput {model} (TP={tp_size}) [Batch: {batch_tokens}]...")
|
||||
log(f"START {model} (TP={tp_size} | {backend_name}) [Batch: {batch_tokens}]...")
|
||||
kill_vllm()
|
||||
nuke_vllm_cache()
|
||||
|
||||
@@ -212,31 +215,43 @@ def run_throughput(model, tp_size):
|
||||
# Inject model specific env vars (e.g. for AWQ)
|
||||
model_env = MODEL_TABLE[model].get("env", {})
|
||||
env.update(model_env)
|
||||
|
||||
# Extra Env
|
||||
if extra_env:
|
||||
env.update(extra_env)
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, check=True, env=env)
|
||||
except:
|
||||
log(f"ERROR: Throughput failed {model}")
|
||||
|
||||
log(f"ERROR: Failed {model} [{backend_name}]")
|
||||
|
||||
|
||||
def print_summary(tps):
|
||||
print(f"\n{'MODEL':<40} | {'TP':<2} | {'TOK/S':<8}")
|
||||
print("-" * 60)
|
||||
print(f"\n{'MODEL':<40} | {'TP':<2} | {'Triton':<8} | {'ROCm':<8}")
|
||||
print("-" * 75)
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
msafe = m.replace("/", "_")
|
||||
for tp in tps:
|
||||
if tp not in MODEL_TABLE[m]["valid_tp"]: continue
|
||||
|
||||
# Default
|
||||
try:
|
||||
tdata = json.loads((RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json").read_text())
|
||||
tok_s = f"{tdata.get('tokens_per_second', 0):.1f}"
|
||||
except: tok_s = "N/A"
|
||||
p1 = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json"
|
||||
d1 = json.loads(p1.read_text())
|
||||
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
|
||||
except: val1 = "N/A"
|
||||
|
||||
# ROCm
|
||||
try:
|
||||
p2 = Path("benchmark_results_rocm_attn/benchmark_results") / f"{msafe}_tp{tp}_throughput.json"
|
||||
d2 = json.loads(p2.read_text())
|
||||
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
|
||||
except: val2 = "N/A"
|
||||
|
||||
name_cell = m.split('/')[-1]
|
||||
print(f"{name_cell:<40} | {tp:<2} | {tok_s:<8}")
|
||||
print("-" * 60)
|
||||
print(f"{name_cell:<40} | {tp:<2} | {val1:<8} | {val2:<8}")
|
||||
print("-" * 75)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
@@ -254,5 +269,13 @@ if __name__ == "__main__":
|
||||
kill_vllm()
|
||||
for tp in valid_tp_args:
|
||||
for m in MODELS_TO_RUN:
|
||||
run_throughput(m, tp)
|
||||
# 1. Default (Triton)
|
||||
run_throughput(m, tp, "Default", RESULTS_DIR)
|
||||
|
||||
# 2. ROCm Attention
|
||||
run_throughput(m, tp, "ROCm-Attn", "benchmark_results_rocm_attn/benchmark_results", {
|
||||
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1",
|
||||
"VLLM_USE_TRITON_FLASH_ATTN": "0"
|
||||
})
|
||||
|
||||
print_summary(valid_tp_args)
|
||||
|
||||
@@ -152,22 +152,21 @@ def get_model_args(model):
|
||||
|
||||
if config.get("trust_remote"): cmd.append("--trust-remote-code")
|
||||
|
||||
# FORCED EAGER as per request for cluster stability
|
||||
cmd.append("--enforce-eager")
|
||||
# Respect config for Eager Mode (Apples-to-Apples with TP=1)
|
||||
if config.get("enforce_eager"):
|
||||
cmd.append("--enforce-eager")
|
||||
|
||||
return cmd
|
||||
|
||||
def run_cluster_throughput(model):
|
||||
# Skip if TP=2 is not valid for this model
|
||||
if CLUSTER_TP not in MODEL_TABLE[model]["valid_tp"]:
|
||||
log(f"SKIP {model} (Support TP={MODEL_TABLE[model]['valid_tp']}, Cluster is TP={CLUSTER_TP})")
|
||||
return
|
||||
|
||||
def run_bench_set(model, backend_name, output_dir, extra_env=None):
|
||||
model_safe = model.replace("/", "_")
|
||||
output_file = RESULTS_DIR / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
output_dir_path = Path(output_dir)
|
||||
output_dir_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
output_file = output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
|
||||
if output_file.exists():
|
||||
log(f"SKIP {model} (Result exists)")
|
||||
log(f"SKIP {model} [{backend_name}] (Result exists)")
|
||||
return
|
||||
|
||||
dataset_path = get_dataset()
|
||||
@@ -175,14 +174,10 @@ def run_cluster_throughput(model):
|
||||
|
||||
batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
|
||||
|
||||
log(f"START Cluster Bench {model} [TP={CLUSTER_TP} | Eager=True]...")
|
||||
log(f"START {model} [TP={CLUSTER_TP} | {backend_name}]...")
|
||||
|
||||
# Nuke cache between runs to be safe
|
||||
nuke_vllm_cache()
|
||||
|
||||
# NOTE: We use 'vllm bench throughput' directly.
|
||||
# It supports most 'vllm serve' args but we need to ensure it picks up the ray backend.
|
||||
|
||||
cmd = ["vllm", "bench", "throughput"] + get_model_args(model)
|
||||
cmd.extend([
|
||||
"--num-prompts", str(OFF_NUM_PROMPTS),
|
||||
@@ -195,43 +190,72 @@ def run_cluster_throughput(model):
|
||||
|
||||
env = get_cluster_env()
|
||||
|
||||
# Add model specific envs
|
||||
# Model specific envs
|
||||
model_env = MODEL_TABLE[model].get("env", {})
|
||||
env.update(model_env)
|
||||
|
||||
# Run specific envs (e.g. ROCm attention)
|
||||
if extra_env:
|
||||
env.update(extra_env)
|
||||
|
||||
try:
|
||||
log(f"Command: {' '.join(cmd)}")
|
||||
subprocess.run(cmd, check=True, env=env)
|
||||
except subprocess.CalledProcessError as e:
|
||||
log(f"ERROR: Cluster Benchmark failed for {model} (Exit {e.returncode})")
|
||||
log(f"ERROR: Failed {model} [{backend_name}] (Exit {e.returncode})")
|
||||
except Exception as e:
|
||||
log(f"ERROR: System error: {e}")
|
||||
|
||||
def run_cluster_throughput(model):
|
||||
# 1. Default Run (Triton usually, unless global envs set)
|
||||
run_bench_set(
|
||||
model,
|
||||
"Default",
|
||||
RESULTS_DIR
|
||||
)
|
||||
|
||||
# 2. ROCm Attention Run
|
||||
run_bench_set(
|
||||
model,
|
||||
"ROCm-Attn",
|
||||
"benchmark_results_rocm_attn/benchmark_results",
|
||||
extra_env={
|
||||
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1",
|
||||
"VLLM_USE_TRITON_FLASH_ATTN": "0"
|
||||
}
|
||||
)
|
||||
|
||||
def print_summary():
|
||||
print(f"\n{'MODEL (Cluster TP=2)':<50} | {'TOK/S':<10}")
|
||||
print("-" * 65)
|
||||
print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}")
|
||||
print("-" * 75)
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
msafe = m.replace("/", "_")
|
||||
|
||||
# Default
|
||||
try:
|
||||
tdata = json.loads((RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json").read_text())
|
||||
tok_s = f"{tdata.get('tokens_per_second', 0):.1f}"
|
||||
except:
|
||||
if CLUSTER_TP not in MODEL_TABLE[m]["valid_tp"]:
|
||||
tok_s = "SKIP"
|
||||
else:
|
||||
tok_s = "N/A"
|
||||
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
d1 = json.loads(p1.read_text())
|
||||
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
|
||||
except: val1 = "N/A"
|
||||
|
||||
# ROCm
|
||||
try:
|
||||
p2 = Path("benchmark_results_rocm_attn/benchmark_results") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
d2 = json.loads(p2.read_text())
|
||||
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
|
||||
except: val2 = "N/A"
|
||||
|
||||
name_cell = m.split('/')[-1]
|
||||
print(f"{name_cell:<50} | {tok_s:<10}")
|
||||
print("-" * 65)
|
||||
print(f"{name_cell:<50} | {val1:<8} | {val2:<8}")
|
||||
print("-" * 75)
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not check_ray_status():
|
||||
log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.")
|
||||
sys.exit(1)
|
||||
|
||||
log("Ray Cluster Detected. Starting Benchmarks...")
|
||||
log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
run_cluster_throughput(m)
|
||||
|
||||
@@ -551,8 +551,11 @@
|
||||
|
||||
// Sort tests: Throughput first, then others alphabetically
|
||||
tests.sort((a, b) => {
|
||||
if (a.name === "Throughput") return -1;
|
||||
if (b.name === "Throughput") return 1;
|
||||
const aTp = a.name.startsWith("Throughput");
|
||||
const bTp = b.name.startsWith("Throughput");
|
||||
|
||||
if (aTp && !bTp) return -1;
|
||||
if (!aTp && bTp) return 1;
|
||||
return a.name.localeCompare(b.name);
|
||||
});
|
||||
|
||||
|
||||
@@ -89,7 +89,7 @@ def parse_logs():
|
||||
if "throughput" in fname:
|
||||
tps = data.get("tokens_per_second", 0)
|
||||
run = base_run.copy()
|
||||
run["test"] = "Throughput"
|
||||
run["test"] = f"Throughput (TP{tp})"
|
||||
run["tps_mean"] = tps
|
||||
if tps == 0 or (isinstance(data, dict) and "error" in str(data).lower()): # checking if error string is in json dump
|
||||
run["error"] = True
|
||||
@@ -111,13 +111,13 @@ def parse_logs():
|
||||
|
||||
# TTFT
|
||||
r1 = base_run.copy()
|
||||
r1["test"] = f"TTFT @ QPS {qps}"
|
||||
r1["test"] = f"TTFT (TP{tp}) @ QPS {qps}"
|
||||
r1["tps_mean"] = ttft
|
||||
runs.append(r1)
|
||||
|
||||
# TPOT
|
||||
r2 = base_run.copy()
|
||||
r2["test"] = f"TPOT @ QPS {qps}"
|
||||
r2["test"] = f"TPOT (TP{tp}) @ QPS {qps}"
|
||||
r2["tps_mean"] = tpot
|
||||
runs.append(r2)
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ HOST_ETH="192.168.1.127"
|
||||
|
||||
# Automatically detect local and remote RDMA device names
|
||||
RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
|
||||
RDMA_DEV_REMOTE=$(ssh fw2 "ibv_devices | awk 'NR==3 {print \$1}'")
|
||||
RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
|
||||
|
||||
WORKDIR="/tmp/rdma_bench"
|
||||
mkdir -p "$WORKDIR"
|
||||
@@ -39,36 +39,42 @@ parse_rdma_bw_mib() {
|
||||
|
||||
# -------- normal ethernet --------
|
||||
ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
|
||||
ssh fw2 "iperf3 -s -1" >/dev/null 2>&1 &
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
|
||||
sleep 1
|
||||
iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
|
||||
|
||||
# -------- roce ethernet (tcp) --------
|
||||
ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
|
||||
ssh fw2 "iperf3 -s -1" >/dev/null 2>&1 &
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
|
||||
sleep 1
|
||||
iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
|
||||
|
||||
# -------- rdma latency --------
|
||||
ssh fw2 "ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
|
||||
sleep 2
|
||||
ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
|
||||
|
||||
# -------- rdma bandwidth (maximized) --------
|
||||
# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
|
||||
ssh fw2 "ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
|
||||
sleep 2
|
||||
ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
|
||||
|
||||
# -------- parse --------
|
||||
ETH_LAT=$(parse_ping_avg "$WORKDIR/ping_eth.txt")
|
||||
ETH_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_eth.txt")
|
||||
ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt")
|
||||
|
||||
ROCE_LAT=$(parse_ping_avg "$WORKDIR/ping_roce.txt")
|
||||
ROCE_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_roce.txt")
|
||||
ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt")
|
||||
|
||||
RDMA_LAT=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt")
|
||||
RDMA_LAT_US=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt")
|
||||
RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt")
|
||||
|
||||
# Convert units for dual display
|
||||
ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')")
|
||||
ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')")
|
||||
RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')")
|
||||
|
||||
RDMA_BW_GBPS=$(python3 - <<EOF
|
||||
import sys
|
||||
try:
|
||||
@@ -82,9 +88,9 @@ EOF
|
||||
echo
|
||||
echo "=== Network Comparison ==="
|
||||
echo
|
||||
printf "%-20s %-12s %-12s\n" "Path" "Latency" "Bandwidth"
|
||||
echo "------------------------------------------------"
|
||||
printf "%-20s %-12s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT} ms" "${ETH_BW} Gbps"
|
||||
printf "%-20s %-12s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT} ms" "${ROCE_BW} Gbps"
|
||||
printf "%-20s %-12s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT} us" "${RDMA_BW_GBPS} Gbps"
|
||||
printf "%-20s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
|
||||
echo "----------------------------------------------------------------"
|
||||
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS} ms" "${ETH_LAT_US} us" "${ETH_BW} Gbps"
|
||||
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS} ms" "${ROCE_LAT_US} us" "${ROCE_BW} Gbps"
|
||||
printf "%-20s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS} ms" "${RDMA_LAT_US} us" "${RDMA_BW_GBPS} Gbps"
|
||||
echo
|
||||
|
||||
Executable
+135
@@ -0,0 +1,135 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Defaults
|
||||
HEAD_IP="192.168.100.1"
|
||||
WORKER_IP="192.168.100.2"
|
||||
MODE=""
|
||||
|
||||
#######################################
# Print the command-line help text and terminate the script.
# Arguments: $1 - exit status (optional, default 1). With status 0 the
#                 text is written to stdout (help was explicitly
#                 requested); with any other status it goes to stderr.
# Outputs:   the usage text
# Returns:   never returns; exits with the requested status
#######################################
usage() {
  local status="${1:-1}"
  local out_fd=2

  if [[ "$status" == "0" ]]; then
    out_fd=1
  fi

  {
    echo "Usage: $0 [mode] [options]"
    echo "Modes:"
    echo "  head         Configure and start Ray head node"
    echo "  worker       Configure and start Ray worker node"
    echo "  run-vllm     Run vLLM serve"
    echo ""
    echo "Options:"
    echo "  --head-ip <ip>    Set Head Node IP (default: 192.168.100.1)"
    echo "  --worker-ip <ip>  Set Worker Node IP (default: 192.168.100.2)"
    echo "  -h, --help        Show this help message"
  } >&"$out_fd"

  exit "$status"
}
|
||||
|
||||
#######################################
# Parse the command line into MODE, HEAD_IP and WORKER_IP.
# Globals:   MODE, HEAD_IP, WORKER_IP (written)
# Arguments: the script's positional parameters
# Outputs:   error messages on stderr
# Returns:   0 on success; calls usage (which exits) on --help, on an
#            unknown option, or when an option is missing its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      head|worker|run-vllm)
        MODE="$1"
        shift
        ;;
      --head-ip)
        # Without this check a trailing "--head-ip" makes "shift 2" fail
        # without shifting anything, and the loop would never terminate.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Error: $1 requires a value" >&2
          usage
        fi
        HEAD_IP="$2"
        shift 2
        ;;
      --worker-ip)
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Error: $1 requires a value" >&2
          usage
        fi
        WORKER_IP="$2"
        shift 2
        ;;
      -h|--help)
        usage 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        usage
        ;;
    esac
  done
}

parse_args "$@"

# A mode is mandatory; show the help text when none was given.
if [[ -z "$MODE" ]]; then
  usage
fi
|
||||
|
||||
#######################################
# Stop any running Ray instance, export the cluster environment for the
# head node and start the Ray head process.
# Globals:   HEAD_IP (read); VLLM_HOST_IP, RDMA_IFACE, NCCL_SOCKET_IFNAME,
#            GLOO_SOCKET_IFNAME, RAY_* (exported)
# Arguments: none
# Outputs:   progress messages on stdout, warnings on stderr
# Returns:   exit status of "ray start"
#######################################
setup_head() {
  local subnet iface

  echo "Configuring Head Node..."
  ray stop --force

  # Critical Config
  export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
  export RAY_memory_monitor_refresh_ms=0
  export VLLM_HOST_IP="$HEAD_IP"

  # Dynamic interface detection: assume HEAD_IP lives in a /24 and pick the
  # first interface that has an address inside that subnet.
  subnet="${HEAD_IP%.*}.0/24"
  iface="$(ip -o addr show to "$subnet" | awk 'NR == 1 { print $2 }')"

  if [[ -z "$iface" ]]; then
    echo "Warning: Could not detect interface for $subnet. Defaulting to eth0." >&2
    iface="eth0"
  fi

  export RDMA_IFACE="$iface"
  export NCCL_SOCKET_IFNAME="$RDMA_IFACE"
  export GLOO_SOCKET_IFNAME="$RDMA_IFACE"

  echo "Starting Ray Head on $HEAD_IP (Interface: $RDMA_IFACE)..."
  ray start --head --port=6379 --node-ip-address="$HEAD_IP" \
    --num-gpus=1 --num-cpus=8 --disable-usage-stats
}
|
||||
|
||||
#######################################
# Stop any running Ray instance, export the cluster environment for the
# worker node and join the Ray cluster at HEAD_IP:6379.
# Globals:   HEAD_IP, WORKER_IP (read); VLLM_HOST_IP, RDMA_IFACE,
#            NCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME, RAY_* (exported)
# Arguments: none
# Outputs:   progress messages on stdout, warnings on stderr
# Returns:   exit status of "ray start"
#######################################
setup_worker() {
  local subnet iface

  echo "Configuring Worker Node..."
  ray stop --force

  # Critical Config
  export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
  export RAY_memory_monitor_refresh_ms=0
  export VLLM_HOST_IP="$WORKER_IP"

  # Dynamic interface detection: assume WORKER_IP lives in a /24 and pick
  # the first interface that has an address inside that subnet.
  subnet="${WORKER_IP%.*}.0/24"
  iface="$(ip -o addr show to "$subnet" | awk 'NR == 1 { print $2 }')"

  if [[ -z "$iface" ]]; then
    echo "Warning: Could not detect interface for $subnet. Defaulting to eth0." >&2
    iface="eth0"
  fi

  export RDMA_IFACE="$iface"
  export NCCL_SOCKET_IFNAME="$RDMA_IFACE"
  export GLOO_SOCKET_IFNAME="$RDMA_IFACE"

  echo "Starting Ray Worker on $WORKER_IP connecting to $HEAD_IP:6379 (Interface: $RDMA_IFACE)..."
  # Pin the advertised node address to WORKER_IP, mirroring the head node,
  # so the worker registers with the address announced in the line above.
  ray start --address="$HEAD_IP:6379" --node-ip-address="$WORKER_IP" \
    --num-gpus=1 --num-cpus=8 --disable-usage-stats
}
|
||||
|
||||
#######################################
# Export the NCCL/vLLM environment for the head node and launch
# "vllm serve" on the Ray backend with tensor parallel size 2.
# Globals:   HEAD_IP (read); VLLM_HOST_IP, NCCL_SOCKET_IFNAME,
#            NCCL_IB_GID_INDEX, NCCL_IB_DISABLE, NCCL_NET_GDR_LEVEL,
#            RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES (exported)
# Arguments: $1 - model to serve (optional, default facebook/opt-125m)
# Outputs:   progress messages on stdout, warnings on stderr
# Returns:   exit status of "vllm serve"
#######################################
run_vllm() {
  local model="${1:-facebook/opt-125m}"
  local subnet iface

  echo "Running vLLM..."

  export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
  export VLLM_HOST_IP="$HEAD_IP"

  # Dynamic interface detection: assume HEAD_IP lives in a /24 and pick the
  # first interface that has an address inside that subnet.
  subnet="${HEAD_IP%.*}.0/24"
  iface="$(ip -o addr show to "$subnet" | awk 'NR == 1 { print $2 }')"

  if [[ -z "$iface" ]]; then
    echo "Warning: Could not detect interface for $subnet. Defaulting to eth0." >&2
    iface="eth0"
  fi
  export NCCL_SOCKET_IFNAME="$iface"

  export NCCL_IB_GID_INDEX=1
  export NCCL_IB_DISABLE=0
  export NCCL_NET_GDR_LEVEL=0

  echo "Launching vLLM Serve..."
  vllm serve "$model" \
    --tensor-parallel-size 2 \
    --distributed-executor-backend ray \
    --trust-remote-code \
    --enforce-eager \
    --gpu-memory-utilization 0.90
}
|
||||
|
||||
# Dispatch to the handler for the selected mode; any other value is a no-op.
case "$MODE" in
  head)
    setup_head
    ;;
  worker)
    setup_worker
    ;;
  run-vllm)
    run_vllm
    ;;
esac
|
||||
Reference in New Issue
Block a user