From b035bcb482c2e12bf8a48ecabc660b6f9d55b76e Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Wed, 25 Feb 2026 10:48:42 +0000 Subject: [PATCH] updated benchmarks including thunderbolt and configuration guides --- ...4B-AWQ_cluster_tp2_eth_usb_throughput.json | 7 + ...q-4bit_cluster_tp2_eth_usb_throughput.json | 7 + ...q-8bit_cluster_tp2_eth_usb_throughput.json | 7 + ...nt4A16_cluster_tp2_eth_usb_throughput.json | 7 + ...12b-it_cluster_tp2_eth_usb_throughput.json | 7 + ...struct_cluster_tp2_eth_usb_throughput.json | 7 + ...T4-AWQ_cluster_tp2_eth_usb_throughput.json | 7 + ...s-120b_cluster_tp2_eth_usb_throughput.json | 7 + ...ss-20b_cluster_tp2_eth_usb_throughput.json | 7 + ...-Flash_cluster_tp2_eth_usb_throughput.json | 7 + ...4B-AWQ_cluster_tp2_eth_usb_throughput.json | 7 + ...q-4bit_cluster_tp2_eth_usb_throughput.json | 7 + ...q-8bit_cluster_tp2_eth_usb_throughput.json | 7 + ...nt4A16_cluster_tp2_eth_usb_throughput.json | 7 + ...12b-it_cluster_tp2_eth_usb_throughput.json | 7 + ...struct_cluster_tp2_eth_usb_throughput.json | 7 + ...T4-AWQ_cluster_tp2_eth_usb_throughput.json | 7 + ...s-120b_cluster_tp2_eth_usb_throughput.json | 7 + ...ss-20b_cluster_tp2_eth_usb_throughput.json | 7 + benchmarks/run_vllm_bench.py | 62 ++- benchmarks/vllm_cluster_bench.py | 71 +++- docs/index.html | 24 +- docs/parse_results.py | 20 + docs/results.json | 358 ++++++++++++++++++ rdma_cluster/compare_eth_vs_rdma.sh | 177 ++++++--- rdma_cluster/setup_guide.md | 58 +++ 26 files changed, 820 insertions(+), 83 deletions(-) create mode 100644 benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json create mode 100644 
benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json create mode 100644 benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json 
b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..d3975c1 --- /dev/null +++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 485.412814248004, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.4120204373051785, + "tokens_per_second": 302.43330149293365 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..905e886 --- /dev/null +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 503.28860085096676, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.397386310084984, + "tokens_per_second": 291.6914862601304 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..3ce6bba --- /dev/null +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 571.4193902639672, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.35000562355367393, + "tokens_per_second": 256.91287782898553 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json 
b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..4992233 --- /dev/null +++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 824.4905019259895, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.2425740497104635, + "tokens_per_second": 178.05541683872298 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..47552cd --- /dev/null +++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 572.292031740013, + "num_requests": 200, + "total_num_tokens": 148857, + "requests_per_second": 0.3494719285046033, + "tokens_per_second": 260.10671430704866 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..1555d23 --- /dev/null +++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 284.23000320699066, + "num_requests": 200, + "total_num_tokens": 145877, + "requests_per_second": 0.7036554823325597, + "tokens_per_second": 513.235753981134 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..a02cef2 --- /dev/null +++ 
b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1474.255295659008, + "num_requests": 200, + "total_num_tokens": 146523, + "requests_per_second": 0.13566171380825723, + "tokens_per_second": 99.38780646163637 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..466cdc4 --- /dev/null +++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1338.8605944840237, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.1493807501871223, + "tokens_per_second": 109.82173992256857 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..d2ecc0b --- /dev/null +++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 282.0738571010297, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.7090341588386423, + "tokens_per_second": 521.2677328949931 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..8a4e731 --- /dev/null +++ b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 455.7690629530116, + "num_requests": 200, + "total_num_tokens": 146278, + "requests_per_second": 0.4388187269758136, + "tokens_per_second": 320.9476287228403 +} \ No newline 
at end of file diff --git a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..1c767d1 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 471.133652363962, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.4245079904534079, + "tokens_per_second": 311.59947769256274 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..3d44152 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 490.5911466999678, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.40767144157681784, + "tokens_per_second": 299.24102990342374 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..f35374f --- /dev/null +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 548.4156070559984, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.3646869225214776, + "tokens_per_second": 267.6893183038276 +} \ No newline at end of file diff --git 
a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..453e4e4 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 839.8958681730437, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.23812475757863116, + "tokens_per_second": 174.78952518165474 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..742b17b --- /dev/null +++ b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 434.26602390100015, + "num_requests": 200, + "total_num_tokens": 148857, + "requests_per_second": 0.46054719686197254, + "tokens_per_second": 342.7783704164132 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..2cf8b02 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 264.80451649799943, + "num_requests": 200, + "total_num_tokens": 145877, + "requests_per_second": 0.7552741269105618, + "tokens_per_second": 550.8856190566602 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json 
b/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..db17d6d --- /dev/null +++ b/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1493.5249834020506, + "num_requests": 200, + "total_num_tokens": 146523, + "requests_per_second": 0.13391138562973798, + "tokens_per_second": 98.10548978313048 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..3fea00f --- /dev/null +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1320.7432732739835, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.1514298834959962, + "tokens_per_second": 111.32822174858649 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json new file mode 100644 index 0000000..2412ba3 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 282.87595599895576, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.707023682142636, + "tokens_per_second": 519.7896706376232 +} \ No newline at end of file diff --git a/benchmarks/run_vllm_bench.py b/benchmarks/run_vllm_bench.py index 35029de..8c335b3 100644 --- a/benchmarks/run_vllm_bench.py +++ b/benchmarks/run_vllm_bench.py @@ -179,31 +179,59 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI def print_summary(tps): - print(f"\n{'MODEL':<40} | {'TP':<2} | 
{'Triton':<8} | {'ROCm':<8}") - print("-" * 75) + print(f"\n{'MODEL':<40} | {'TP':<2} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}") + print("-" * 92) for m in MODELS_TO_RUN: msafe = m.replace("/", "_") + name_cell = m.split('/')[-1] + for tp in tps: if tp not in MODEL_TABLE[m]["valid_tp"]: continue - # Default - try: - p1 = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json" - d1 = json.loads(p1.read_text()) - val1 = f"{d1.get('tokens_per_second', 0):.1f}" - except: val1 = "N/A" + prefix = f"{msafe}_tp{tp}" - # ROCm - try: - p2 = Path("benchmark_results_rocm") / f"{msafe}_tp{tp}_throughput.json" - d2 = json.loads(p2.read_text()) - val2 = f"{d2.get('tokens_per_second', 0):.1f}" - except: val2 = "N/A" + tags = set() + for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"): + name_part = p.name[len(prefix):-len("_throughput.json")] + tag = name_part.lstrip("_") + tags.add(tag) + + for p in Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"): + name_part = p.name[len(prefix):-len("_throughput.json")] + tag = name_part.lstrip("_") + tags.add(tag) + + if not tags: + tags.add("") # Default empty tag if no files found + + for tag in sorted(list(tags)): + tag_suffix = f"_{tag}" if tag else "" + + # Default + try: + p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json" + if p1.exists(): + d1 = json.loads(p1.read_text()) + val1 = f"{d1.get('tokens_per_second', 0):.1f}" + else: + val1 = "N/A" + except: val1 = "N/A" + + # ROCm + try: + p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json" + if p2.exists(): + d2 = json.loads(p2.read_text()) + val2 = f"{d2.get('tokens_per_second', 0):.1f}" + else: + val2 = "N/A" + except: val2 = "N/A" - name_cell = m.split('/')[-1] - print(f"{name_cell:<40} | {tp:<2} | {val1:<8} | {val2:<8}") - print("-" * 75) + display_tag = tag if tag else "(Default)" + print(f"{name_cell:<40} | {tp:<2} | {display_tag:<15} | {val1:<8} | {val2:<8}") + + print("-" * 92) if __name__ == "__main__": parser = 
argparse.ArgumentParser() diff --git a/benchmarks/vllm_cluster_bench.py b/benchmarks/vllm_cluster_bench.py index 0ac8caf..2f2b2e2 100755 --- a/benchmarks/vllm_cluster_bench.py +++ b/benchmarks/vllm_cluster_bench.py @@ -213,7 +213,7 @@ def run_bench_set(model, backend_name, output_dir, extra_env=None, overrides=Non log(f"START {model} [TP={CLUSTER_TP} | {backend_name}]...") - nuke_vllm_cache(HEAD_IP) + nuke_vllm_cache() cmd = ["vllm", "bench", "throughput"] + get_model_args(model, overrides) cmd.extend([ @@ -279,29 +279,60 @@ def run_cluster_throughput(model, overrides=None): def print_summary(): eth_suffix = "_eth" if FORCE_ETH else "" title_suffix = " (Ethernet ONLY)" if FORCE_ETH else "" - print(f"\n{f'MODEL (TP=2){title_suffix}':<50} | {'Triton':<8} | {'ROCm':<8}") - print("-" * 75) + print(f"\n{f'MODEL (TP={CLUSTER_TP}){title_suffix}':<50} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}") + print("-" * 92) for m in MODELS_TO_RUN: msafe = m.replace("/", "_") - - # Default - try: - p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json" - d1 = json.loads(p1.read_text()) - val1 = f"{d1.get('tokens_per_second', 0):.1f}" - except: val1 = "N/A" - - # ROCm - try: - p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json" - d2 = json.loads(p2.read_text()) - val2 = f"{d2.get('tokens_per_second', 0):.1f}" - except: val2 = "N/A" - name_cell = m.split('/')[-1] - print(f"{name_cell:<50} | {val1:<8} | {val2:<8}") - print("-" * 75) + + # Find all tags used for this model by looking at the files in RESULTS_DIR + prefix = f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}" + + # Gather all unique tags from both directories + tags = set() + for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"): + # Extract tag: {prefix}_{tag}_throughput.json or {prefix}_throughput.json + name_part = p.name[len(prefix):-len("_throughput.json")] + tag = name_part.lstrip("_") + tags.add(tag) + + for p in 
Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"): + name_part = p.name[len(prefix):-len("_throughput.json")] + tag = name_part.lstrip("_") + tags.add(tag) + + if not tags: + tags.add("") # Default empty tag if no files found + + # Sort so empty tag (Default) comes first + for tag in sorted(list(tags)): + tag_suffix = f"_{tag}" if tag else "" + + # Default (Triton) + try: + p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json" + if p1.exists(): + d1 = json.loads(p1.read_text()) + val1 = f"{d1.get('tokens_per_second', 0):.1f}" + else: + val1 = "N/A" + except: val1 = "N/A" + + # ROCm + try: + p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json" + if p2.exists(): + d2 = json.loads(p2.read_text()) + val2 = f"{d2.get('tokens_per_second', 0):.1f}" + else: + val2 = "N/A" + except: val2 = "N/A" + + display_tag = tag if tag else "(Default)" + print(f"{name_cell:<50} | {display_tag:<15} | {val1:<8} | {val2:<8}") + + print("-" * 92) if __name__ == "__main__": parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark") diff --git a/docs/index.html b/docs/index.html index 865cdd7..0a2e252 100644 --- a/docs/index.html +++ b/docs/index.html @@ -473,6 +473,10 @@ style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;"> TP2 (Eth) + @@ -549,6 +553,7 @@ showTP1: true, showTP2: true, showTP2Eth: false, + showTP2Usb: false, showTriton: true, showRocm: false }; @@ -621,6 +626,7 @@ $('toggleTP1').addEventListener('change', e => { state.showTP1 = e.target.checked; render(); }); $('toggleTP2').addEventListener('change', e => { state.showTP2 = e.target.checked; render(); }); $('toggleTP2Eth').addEventListener('change', e => { state.showTP2Eth = e.target.checked; render(); }); + $('toggleTP2Usb').addEventListener('change', e => { state.showTP2Usb = e.target.checked; render(); }); $('toggleTriton').addEventListener('change', e => { state.showTriton = e.target.checked; render(); }); 
$('toggleRocm').addEventListener('change', e => { state.showRocm = e.target.checked; render(); }); } @@ -643,15 +649,21 @@ results: { 1: { triton: null, rocm: null }, 2: { triton: null, rocm: null }, - "2_eth": { triton: null, rocm: null } + "2_eth": { triton: null, rocm: null }, + "2_usb": { triton: null, rocm: null } } }; } const m = testGroups[testName].models[modelName]; let tp = run.tp || 1; - if (tp === 2 && run.network === "Ethernet") { - tp = "2_eth"; + if (tp === 2) { + if (run.network === "Ethernet") { + if (run.tag === "usb") tp = "2_usb"; + else tp = "2_eth"; + } else if (run.tag === "usb") { + tp = "2_usb"; + } } if (!m.results[tp]) m.results[tp] = { triton: null, rocm: null }; @@ -766,6 +778,10 @@ if (state.showTriton) cols.push({ id: "tp2_eth_triton", label: "TP2 Eth Triton" }); if (state.showRocm) cols.push({ id: "tp2_eth_rocm", label: "TP2 Eth ROCm" }); } + if (state.showTP2Usb) { + if (state.showTriton) cols.push({ id: "tp2_usb_triton", label: "TP2 TB Triton" }); + if (state.showRocm) cols.push({ id: "tp2_usb_rocm", label: "TP2 TB ROCm" }); + } // Thead let theadHtml = `Model`; @@ -835,6 +851,8 @@ if (colId === "tp2_rocm") return m.results[2]?.rocm; if (colId === "tp2_eth_triton") return m.results["2_eth"]?.triton; if (colId === "tp2_eth_rocm") return m.results["2_eth"]?.rocm; + if (colId === "tp2_usb_triton") return m.results["2_usb"]?.triton; + if (colId === "tp2_usb_rocm") return m.results["2_usb"]?.rocm; return null; } diff --git a/docs/parse_results.py b/docs/parse_results.py index 820e43f..352131d 100644 --- a/docs/parse_results.py +++ b/docs/parse_results.py @@ -68,8 +68,27 @@ def parse_logs(): # Network network = "RoCE" + network_prefix = "" if "_eth" in rest: network = "Ethernet" + network_prefix = "_eth" + + # Tag Extraction + tag = "" + test_type_str = "" + if "throughput" in fname: + test_type_str = "_throughput.json" + elif "latency" in fname: + qps_match = re.search(r"(_qps[\d\.]+)_latency\.json$", rest) + if qps_match: + 
test_type_str = qps_match.group(0) + else: + test_type_str = "_latency.json" + + raw_prefix = f"{tp}{network_prefix}" + if rest.endswith(test_type_str): + tag_part = rest[len(raw_prefix):-len(test_type_str)] + tag = tag_part.lstrip("_") # Model Name if "_" in model_part: @@ -93,6 +112,7 @@ def parse_logs(): "name_params_b": params_b, "backend": backend_name, # "Triton" or "ROCm" "network": network, + "tag": tag, "error": False } diff --git a/docs/results.json b/docs/results.json index 7eb8236..e55d002 100644 --- a/docs/results.json +++ b/docs/results.json @@ -10,6 +10,7 @@ "name_params_b": 8.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -25,6 +26,7 @@ "name_params_b": 12.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -40,6 +42,7 @@ "name_params_b": 14.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -55,6 +58,7 @@ "name_params_b": 20.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -70,6 +74,7 @@ "name_params_b": 120.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -85,6 +90,7 @@ "name_params_b": null, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -100,6 +106,7 @@ "name_params_b": 30.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -115,6 +122,7 @@ "name_params_b": 30.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -130,6 +138,7 @@ "name_params_b": 80.0, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -145,6 +154,7 @@ "name_params_b": null, "backend": "Triton", "network": "Ethernet", + "tag": "", "error": false, "test": 
"Throughput", "tp": 2, @@ -160,6 +170,7 @@ "name_params_b": 8.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -175,6 +186,7 @@ "name_params_b": 12.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -190,6 +202,7 @@ "name_params_b": 14.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -205,6 +218,7 @@ "name_params_b": 20.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -220,6 +234,7 @@ "name_params_b": 120.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -235,6 +250,7 @@ "name_params_b": null, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -250,6 +266,7 @@ "name_params_b": 30.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -265,6 +282,7 @@ "name_params_b": 30.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -280,6 +298,7 @@ "name_params_b": 80.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -295,6 +314,7 @@ "name_params_b": null, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -310,6 +330,7 @@ "name_params_b": 8.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -325,6 +346,7 @@ "name_params_b": 14.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -340,6 +362,7 @@ "name_params_b": 20.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -355,6 +378,7 @@ "name_params_b": 120.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, 
"test": "Throughput", "tp": 1, @@ -370,6 +394,7 @@ "name_params_b": null, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -385,6 +410,7 @@ "name_params_b": 30.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -400,6 +426,7 @@ "name_params_b": 30.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -415,6 +442,7 @@ "name_params_b": 80.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -430,11 +458,172 @@ "name_params_b": 12.0, "backend": "Triton", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, "tps_mean": 159.95620436815713 }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "model_clean": "Qwen/Qwen3-14B-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "AWQ", + "params_b": 14.0, + "name_params_b": 14.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 302.43330149293365 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 291.6914862601304 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 256.91287782898553 + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "model_clean": 
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 80.0, + "name_params_b": 80.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 178.05541683872298 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 260.10671430704866 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 8.0, + "name_params_b": 8.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 513.235753981134 + }, + { + "model": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "model_clean": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 99.38780646163637 + }, + { + "model": "openai/gpt-oss-120b", + "model_clean": "openai/gpt-oss-120b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 120.0, + "name_params_b": 120.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 109.82173992256857 + }, + { + "model": "openai/gpt-oss-20b", + "model_clean": "openai/gpt-oss-20b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 20.0, + "name_params_b": 20.0, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": 
"Throughput", + "tp": 2, + "tps_mean": 521.2677328949931 + }, + { + "model": "zai-org/GLM-4.7-Flash", + "model_clean": "zai-org/GLM-4.7-Flash", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "Triton", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 320.9476287228403 + }, { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", @@ -445,6 +634,7 @@ "name_params_b": 8.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -460,6 +650,7 @@ "name_params_b": 12.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -475,6 +666,7 @@ "name_params_b": 14.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -490,6 +682,7 @@ "name_params_b": 20.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -505,6 +698,7 @@ "name_params_b": 120.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -520,6 +714,7 @@ "name_params_b": 30.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -535,6 +730,7 @@ "name_params_b": 30.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -550,6 +746,7 @@ "name_params_b": 80.0, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -565,6 +762,7 @@ "name_params_b": null, "backend": "ROCm", "network": "Ethernet", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -580,6 +778,7 @@ "name_params_b": 8.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -595,6 +794,7 @@ "name_params_b": 12.0, 
"backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -610,6 +810,7 @@ "name_params_b": 14.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -625,6 +826,7 @@ "name_params_b": 20.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -640,6 +842,7 @@ "name_params_b": 120.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -655,6 +858,7 @@ "name_params_b": 30.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -670,6 +874,7 @@ "name_params_b": 30.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -685,6 +890,7 @@ "name_params_b": 80.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 2, @@ -700,6 +906,7 @@ "name_params_b": 8.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -715,6 +922,7 @@ "name_params_b": 14.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -730,6 +938,7 @@ "name_params_b": 20.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -745,6 +954,7 @@ "name_params_b": 120.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -760,6 +970,7 @@ "name_params_b": 30.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -775,6 +986,7 @@ "name_params_b": 30.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -790,6 +1002,7 @@ "name_params_b": 80.0, "backend": "ROCm", "network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, @@ -805,10 +1018,155 @@ "name_params_b": 12.0, "backend": "ROCm", 
"network": "RoCE", + "tag": "", "error": false, "test": "Throughput", "tp": 1, "tps_mean": 267.99881204205957 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "model_clean": "Qwen/Qwen3-14B-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "AWQ", + "params_b": 14.0, + "name_params_b": 14.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 311.59947769256274 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 299.24102990342374 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 267.6893183038276 + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 80.0, + "name_params_b": 80.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 174.78952518165474 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 342.7783704164132 + }, + { + "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 8.0, + "name_params_b": 8.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 550.8856190566602 + }, + { + "model": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "model_clean": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 98.10548978313048 + }, + { + "model": "openai/gpt-oss-120b", + "model_clean": "openai/gpt-oss-120b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 120.0, + "name_params_b": 120.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 111.32822174858649 + }, + { + "model": "openai/gpt-oss-20b", + "model_clean": "openai/gpt-oss-20b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 20.0, + "name_params_b": 20.0, + "backend": "ROCm", + "network": "Ethernet", + "tag": "usb", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 519.7896706376232 } ] } \ No newline at end of file diff --git a/rdma_cluster/compare_eth_vs_rdma.sh b/rdma_cluster/compare_eth_vs_rdma.sh index d4a5cc9..6956029 100755 --- a/rdma_cluster/compare_eth_vs_rdma.sh +++ b/rdma_cluster/compare_eth_vs_rdma.sh @@ -3,62 +3,140 @@ # -------- dynamic config -------- HOST_ROCE="192.168.100.2" HOST_ETH="192.168.1.127" +HOST_TB="192.168.2.2" -# Automatically detect local and remote RDMA device names -RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}') -RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'") +# Parse args +RUN_ETH=true 
+RUN_ROCE=true +RUN_TB=true +RUN_RDMA=true + +# If any flags are provided, turn off defaults and only run requested +if [ "$#" -gt 0 ]; then + RUN_ETH=false + RUN_ROCE=false + RUN_TB=false + RUN_RDMA=false +fi + +while getopts "ertih" opt; do + case ${opt} in + e ) RUN_ETH=true ;; + r ) RUN_ROCE=true ;; + t ) RUN_TB=true ;; + i ) RUN_RDMA=true ;; + h ) echo "Usage: $0 [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)]" + echo + echo "Options:" + echo " -e Run benchmarking for standard Ethernet (1G LAN)." + echo " -r Run benchmarking for RoCE NIC (via Ethernet/TCP)." + echo " -t Run benchmarking for Thunderbolt link." + echo " -i Run benchmarking for RDMA (RoCE v2)." + echo " -h Print this help message and exit." + echo + echo "If no arguments are provided, all benchmarks are executed." + exit 0 + ;; + \? ) echo "Usage: cmd [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)] [-h (Help)]" + exit 1 + ;; + esac +done + +# Automatically detect local and remote RDMA device names if needed +if [ "$RUN_RDMA" = true ]; then + RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}') + RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'") +fi WORKDIR="/tmp/rdma_bench" mkdir -p "$WORKDIR" # -------- helpers -------- parse_ping_avg() { - grep rtt "$1" | awk -F'/' '{print $5}' + if [ -f "$1" ]; then + grep rtt "$1" | awk -F'/' '{print $5}' + else + echo "0" + fi } parse_iperf_gbps() { - grep receiver "$1" | tail -n1 | awk ' - { - val=$(NF-2); - unit=$(NF-1); - if (unit=="Mbits/sec") printf "%.2f", val/1000; - else if (unit=="Gbits/sec") printf "%.2f", val; - else print "N/A"; - }' + if [ -f "$1" ]; then + grep receiver "$1" | tail -n1 | awk ' + { + val=$(NF-2); + unit=$(NF-1); + if (unit=="Mbits/sec") printf "%.2f", val/1000; + else if (unit=="Gbits/sec") printf "%.2f", val; + else print "0.00"; + }' + else + echo "0.00" + fi } parse_rdma_lat_us() { - val=$(grep -E 
'^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}') - echo "${val:-0}" + if [ -f "$1" ]; then + val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}') + echo "${val:-0}" + else + echo "0" + fi } parse_rdma_bw_mib() { - val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}') - echo "${val:-0}" + if [ -f "$1" ]; then + val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}') + echo "${val:-0}" + else + echo "0" + fi } -# -------- normal ethernet -------- -ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt" -ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 & -sleep 1 -iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt" +# Clear old results +rm -f "$WORKDIR"/*.txt -# -------- roce ethernet (tcp) -------- -ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt" -ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 & -sleep 1 -iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt" +if [ "$RUN_ETH" = true ]; then + # -------- normal ethernet -------- + echo "[*] Benchmarking Ethernet (1G LAN)..." + ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt" + ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 & + sleep 1 + iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt" +fi -# -------- rdma latency -------- -ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 & -sleep 2 -ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1 +if [ "$RUN_ROCE" = true ]; then + # -------- roce ethernet (tcp) -------- + echo "[*] Benchmarking RoCE NIC (Ethernet/TCP)..." 
+ ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt" + ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 & + sleep 1 + iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt" +fi -# -------- rdma bandwidth (maximized) -------- -# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1 -ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 & -sleep 2 -ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1 +if [ "$RUN_TB" = true ]; then + # -------- thunderbolt ethernet (tcp) -------- + echo "[*] Benchmarking Thunderbolt..." + ping -c 10 "$HOST_TB" > "$WORKDIR/ping_tb.txt" + ssh "$HOST_TB" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 & + sleep 1 + iperf3 -c "$HOST_TB" -P 8 -t 10 > "$WORKDIR/iperf_tb.txt" +fi + +if [ "$RUN_RDMA" = true ]; then + # -------- rdma latency -------- + echo "[*] Benchmarking RDMA (RoCE v2)..." + ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 & + sleep 2 + ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1 + + # -------- rdma bandwidth (maximized) -------- + # We use -x 1 because show_gids confirmed RoCE v2 is at Index 1 + ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 & + sleep 2 + ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1 +fi # -------- parse -------- ETH_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_eth.txt") @@ -67,13 +145,17 @@ ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt") ROCE_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_roce.txt") ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt") +TB_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_tb.txt") +TB_BW=$(parse_iperf_gbps "$WORKDIR/iperf_tb.txt") + RDMA_LAT_US=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt") RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt") # 
Convert units for dual display -ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')") -ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')") -RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')") +ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00") +ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00") +TB_LAT_US=$(python3 -c "print(f'{float(${TB_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00") +RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')" 2>/dev/null || echo "0.00") RDMA_BW_GBPS=$(python3 - <**Note**: `thunderbolt-net` relies on standard OS kernel TCP/IP stacks. + +### 9.1 Thunderbolt Configuration + +**1. Establish Connection:** +Connect the nodes directly using a certified Thunderbolt 4 or USB4 cable. Verify the link is active: +```bash +ip link show thunderbolt0 +``` + +**2. Network Configuration (Head - Node 1):** +Configure a persistent connection using `nmcli` with a static IP and Jumbo Frames (reduces CPU overhead). +*Note: Jumbo Frames may be unsupported on some Thunderbolt host controllers.* +```bash +sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.1/24 mtu 9000 +sudo nmcli connection up thunderbolt0 +``` + +**3. Network Configuration (Worker - Node 2):** +```bash +sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.2/24 mtu 9000 +sudo nmcli connection up thunderbolt0 +``` + +**4. 
Firewall Rules:** +To ensure Ray and NCCL can communicate freely over this link: +```bash +# Assign the interface to the trusted zone permanently +sudo firewall-cmd --permanent --zone=trusted --add-interface=thunderbolt0 +sudo firewall-cmd --reload +``` + +### 9.2 Running vLLM over Thunderbolt + +Our cluster scripts dynamically detect the network interface based on the provided IPs. There is no need to manually export environment variables! + +1. Open the Toolbox: `toolbox enter vllm` +2. Launch the cluster manager: `start-vllm-cluster` +3. Select **Option 1 (Configure IPs)**. +4. Set the **Head IP** explicitly to `192.168.2.1` and the **Worker IP** to `192.168.2.2`. +5. Start the cluster normally (Option 2). The script will automatically discover and utilize `thunderbolt0` as the backend network for Ray orchestration and GPU synchronization. + +### 9.3 Validating the Link +The `compare_eth_vs_rdma.sh` script includes Thunderbolt support. Run it from inside the toolbox to see the latency and bandwidth of your Thunderbolt link compared to your other network interfaces. + +You can use the `-t` flag to benchmark only the Thunderbolt connection (or `-e`, `-r`, `-i` for the others): +```bash +/opt/compare_eth_vs_rdma.sh -t +```