From b035bcb482c2e12bf8a48ecabc660b6f9d55b76e Mon Sep 17 00:00:00 2001
From: Donato Capitella <donato.capitella@reversec.com>
Date: Wed, 25 Feb 2026 10:48:42 +0000
Subject: [PATCH] updated benchmarks including thunderbolt and configuration
 guides

---
 ...4B-AWQ_cluster_tp2_eth_usb_throughput.json |   7 +
 ...q-4bit_cluster_tp2_eth_usb_throughput.json |   7 +
 ...q-8bit_cluster_tp2_eth_usb_throughput.json |   7 +
 ...nt4A16_cluster_tp2_eth_usb_throughput.json |   7 +
 ...12b-it_cluster_tp2_eth_usb_throughput.json |   7 +
 ...struct_cluster_tp2_eth_usb_throughput.json |   7 +
 ...T4-AWQ_cluster_tp2_eth_usb_throughput.json |   7 +
 ...s-120b_cluster_tp2_eth_usb_throughput.json |   7 +
 ...ss-20b_cluster_tp2_eth_usb_throughput.json |   7 +
 ...-Flash_cluster_tp2_eth_usb_throughput.json |   7 +
 ...4B-AWQ_cluster_tp2_eth_usb_throughput.json |   7 +
 ...q-4bit_cluster_tp2_eth_usb_throughput.json |   7 +
 ...q-8bit_cluster_tp2_eth_usb_throughput.json |   7 +
 ...nt4A16_cluster_tp2_eth_usb_throughput.json |   7 +
 ...12b-it_cluster_tp2_eth_usb_throughput.json |   7 +
 ...struct_cluster_tp2_eth_usb_throughput.json |   7 +
 ...T4-AWQ_cluster_tp2_eth_usb_throughput.json |   7 +
 ...s-120b_cluster_tp2_eth_usb_throughput.json |   7 +
 ...ss-20b_cluster_tp2_eth_usb_throughput.json |   7 +
 benchmarks/run_vllm_bench.py                  |  62 ++-
 benchmarks/vllm_cluster_bench.py              |  71 +++-
 docs/index.html                               |  24 +-
 docs/parse_results.py                         |  20 +
 docs/results.json                             | 358 ++++++++++++++++++
 rdma_cluster/compare_eth_vs_rdma.sh           | 177 ++++++---
 rdma_cluster/setup_guide.md                   |  58 +++
 26 files changed, 820 insertions(+), 83 deletions(-)
 create mode 100644 benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json
 create mode 100644 benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json

diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..d3975c1
--- /dev/null
+++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 485.412814248004,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.4120204373051785,
+    "tokens_per_second": 302.43330149293365
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..905e886
--- /dev/null
+++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 503.28860085096676,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.397386310084984,
+    "tokens_per_second": 291.6914862601304
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..3ce6bba
--- /dev/null
+++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 571.4193902639672,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.35000562355367393,
+    "tokens_per_second": 256.91287782898553
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..4992233
--- /dev/null
+++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 824.4905019259895,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.2425740497104635,
+    "tokens_per_second": 178.05541683872298
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..47552cd
--- /dev/null
+++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 572.292031740013,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.3494719285046033,
+    "tokens_per_second": 260.10671430704866
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..1555d23
--- /dev/null
+++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 284.23000320699066,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.7036554823325597,
+    "tokens_per_second": 513.235753981134
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..a02cef2
--- /dev/null
+++ b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1474.255295659008,
+    "num_requests": 200,
+    "total_num_tokens": 146523,
+    "requests_per_second": 0.13566171380825723,
+    "tokens_per_second": 99.38780646163637
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..466cdc4
--- /dev/null
+++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1338.8605944840237,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.1493807501871223,
+    "tokens_per_second": 109.82173992256857
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..d2ecc0b
--- /dev/null
+++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 282.0738571010297,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.7090341588386423,
+    "tokens_per_second": 521.2677328949931
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..8a4e731
--- /dev/null
+++ b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 455.7690629530116,
+    "num_requests": 200,
+    "total_num_tokens": 146278,
+    "requests_per_second": 0.4388187269758136,
+    "tokens_per_second": 320.9476287228403
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..1c767d1
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 471.133652363962,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.4245079904534079,
+    "tokens_per_second": 311.59947769256274
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..3d44152
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 490.5911466999678,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.40767144157681784,
+    "tokens_per_second": 299.24102990342374
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..f35374f
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 548.4156070559984,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.3646869225214776,
+    "tokens_per_second": 267.6893183038276
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..453e4e4
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 839.8958681730437,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.23812475757863116,
+    "tokens_per_second": 174.78952518165474
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..742b17b
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 434.26602390100015,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.46054719686197254,
+    "tokens_per_second": 342.7783704164132
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..2cf8b02
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 264.80451649799943,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.7552741269105618,
+    "tokens_per_second": 550.8856190566602
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..db17d6d
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1493.5249834020506,
+    "num_requests": 200,
+    "total_num_tokens": 146523,
+    "requests_per_second": 0.13391138562973798,
+    "tokens_per_second": 98.10548978313048
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..3fea00f
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1320.7432732739835,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.1514298834959962,
+    "tokens_per_second": 111.32822174858649
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json
new file mode 100644
index 0000000..2412ba3
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_usb_throughput.json
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 282.87595599895576,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.707023682142636,
+    "tokens_per_second": 519.7896706376232
+}
\ No newline at end of file
diff --git a/benchmarks/run_vllm_bench.py b/benchmarks/run_vllm_bench.py
index 35029de..8c335b3 100644
--- a/benchmarks/run_vllm_bench.py
+++ b/benchmarks/run_vllm_bench.py
@@ -179,31 +179,59 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI
 
 
 def print_summary(tps):
-    print(f"\n{'MODEL':<40} | {'TP':<2} | {'Triton':<8} | {'ROCm':<8}")
-    print("-" * 75)
+    print(f"\n{'MODEL':<40} | {'TP':<2} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}")
+    print("-" * 92)
     
     for m in MODELS_TO_RUN:
         msafe = m.replace("/", "_")
+        name_cell = m.split('/')[-1]
+        
         for tp in tps:
             if tp not in MODEL_TABLE[m]["valid_tp"]: continue
             
-            # Default
-            try: 
-                p1 = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json"
-                d1 = json.loads(p1.read_text())
-                val1 = f"{d1.get('tokens_per_second', 0):.1f}"
-            except: val1 = "N/A"
+            prefix = f"{msafe}_tp{tp}"
             
-            # ROCm
-            try:
-                p2 = Path("benchmark_results_rocm") / f"{msafe}_tp{tp}_throughput.json"
-                d2 = json.loads(p2.read_text())
-                val2 = f"{d2.get('tokens_per_second', 0):.1f}"
-            except: val2 = "N/A"
+            tags = set()
+            for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"):
+                name_part = p.name[len(prefix):-len("_throughput.json")]
+                tag = name_part.lstrip("_")
+                tags.add(tag)
+                
+            for p in Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"):
+                name_part = p.name[len(prefix):-len("_throughput.json")]
+                tag = name_part.lstrip("_")
+                tags.add(tag)
+                
+            if not tags:
+                tags.add("") # Default empty tag if no files found
+                
+            for tag in sorted(list(tags)):
+                tag_suffix = f"_{tag}" if tag else ""
+                
+                # Default
+                try: 
+                    p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json"
+                    if p1.exists():
+                        d1 = json.loads(p1.read_text())
+                        val1 = f"{d1.get('tokens_per_second', 0):.1f}"
+                    else:
+                        val1 = "N/A"
+                except: val1 = "N/A"
+                
+                # ROCm
+                try:
+                    p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json"
+                    if p2.exists():
+                        d2 = json.loads(p2.read_text())
+                        val2 = f"{d2.get('tokens_per_second', 0):.1f}"
+                    else:
+                        val2 = "N/A"
+                except: val2 = "N/A"
 
-            name_cell = m.split('/')[-1]
-            print(f"{name_cell:<40} | {tp:<2} | {val1:<8} | {val2:<8}")
-    print("-" * 75)
+                display_tag = tag if tag else "(Default)"
+                print(f"{name_cell:<40} | {tp:<2} | {display_tag:<15} | {val1:<8} | {val2:<8}")
+                
+    print("-" * 92)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/benchmarks/vllm_cluster_bench.py b/benchmarks/vllm_cluster_bench.py
index 0ac8caf..2f2b2e2 100755
--- a/benchmarks/vllm_cluster_bench.py
+++ b/benchmarks/vllm_cluster_bench.py
@@ -213,7 +213,7 @@ def run_bench_set(model, backend_name, output_dir, extra_env=None, overrides=Non
 
     log(f"START {model} [TP={CLUSTER_TP} | {backend_name}]...")
     
-    nuke_vllm_cache(HEAD_IP)
+    nuke_vllm_cache()
 
     cmd = ["vllm", "bench", "throughput"] + get_model_args(model, overrides)
     cmd.extend([
@@ -279,29 +279,60 @@ def run_cluster_throughput(model, overrides=None):
 def print_summary():
     eth_suffix = "_eth" if FORCE_ETH else ""
     title_suffix = " (Ethernet ONLY)" if FORCE_ETH else ""
-    print(f"\n{f'MODEL (TP=2){title_suffix}':<50} | {'Triton':<8} | {'ROCm':<8}")
-    print("-" * 75)
+    print(f"\n{f'MODEL (TP={CLUSTER_TP}){title_suffix}':<50} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}")
+    print("-" * 92)
     
     for m in MODELS_TO_RUN:
         msafe = m.replace("/", "_")
-        
-        # Default
-        try: 
-            p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
-            d1 = json.loads(p1.read_text())
-            val1 = f"{d1.get('tokens_per_second', 0):.1f}"
-        except: val1 = "N/A"
-        
-        # ROCm
-        try:
-            p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
-            d2 = json.loads(p2.read_text())
-            val2 = f"{d2.get('tokens_per_second', 0):.1f}"
-        except: val2 = "N/A"
-
         name_cell = m.split('/')[-1]
-        print(f"{name_cell:<50} | {val1:<8} | {val2:<8}")
-    print("-" * 75)
+        
+        # Find all tags used for this model by looking at the files in RESULTS_DIR
+        prefix = f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}"
+        
+        # Gather all unique tags from both directories
+        tags = set()
+        for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"):
+            # Extract tag: {prefix}_{tag}_throughput.json or {prefix}_throughput.json
+            name_part = p.name[len(prefix):-len("_throughput.json")]
+            tag = name_part.lstrip("_")
+            tags.add(tag)
+            
+        for p in Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"):
+            name_part = p.name[len(prefix):-len("_throughput.json")]
+            tag = name_part.lstrip("_")
+            tags.add(tag)
+            
+        if not tags:
+            tags.add("") # Default empty tag if no files found
+            
+        # Sort so empty tag (Default) comes first
+        for tag in sorted(list(tags)):
+            tag_suffix = f"_{tag}" if tag else ""
+            
+            # Default (Triton)
+            try: 
+                p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json"
+                if p1.exists():
+                    d1 = json.loads(p1.read_text())
+                    val1 = f"{d1.get('tokens_per_second', 0):.1f}"
+                else:
+                    val1 = "N/A"
+            except: val1 = "N/A"
+            
+            # ROCm
+            try:
+                p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json"
+                if p2.exists():
+                    d2 = json.loads(p2.read_text())
+                    val2 = f"{d2.get('tokens_per_second', 0):.1f}"
+                else:
+                    val2 = "N/A"
+            except: val2 = "N/A"
+
+            display_tag = tag if tag else "(Default)"
+            print(f"{name_cell:<50} | {display_tag:<15} | {val1:<8} | {val2:<8}")
+            
+    print("-" * 92)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark")
diff --git a/docs/index.html b/docs/index.html
index 865cdd7..0a2e252 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -473,6 +473,10 @@
                     style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
                     <input type="checkbox" id="toggleTP2Eth"> TP2 (Eth)
                 </label>
+                <label
+                    style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
+                    <input type="checkbox" id="toggleTP2Usb"> TP2 (Thunderbolt)
+                </label>
             </div>
 
             <!-- Attention Group -->
@@ -549,6 +553,7 @@
             showTP1: true,
             showTP2: true,
             showTP2Eth: false,
+            showTP2Usb: false,
             showTriton: true,
             showRocm: false
         };
@@ -621,6 +626,7 @@
             $('toggleTP1').addEventListener('change', e => { state.showTP1 = e.target.checked; render(); });
             $('toggleTP2').addEventListener('change', e => { state.showTP2 = e.target.checked; render(); });
             $('toggleTP2Eth').addEventListener('change', e => { state.showTP2Eth = e.target.checked; render(); });
+            $('toggleTP2Usb').addEventListener('change', e => { state.showTP2Usb = e.target.checked; render(); });
             $('toggleTriton').addEventListener('change', e => { state.showTriton = e.target.checked; render(); });
             $('toggleRocm').addEventListener('change', e => { state.showRocm = e.target.checked; render(); });
         }
@@ -643,15 +649,21 @@
                         results: {
                             1: { triton: null, rocm: null },
                             2: { triton: null, rocm: null },
-                            "2_eth": { triton: null, rocm: null }
+                            "2_eth": { triton: null, rocm: null },
+                            "2_usb": { triton: null, rocm: null }
                         }
                     };
                 }
 
                 const m = testGroups[testName].models[modelName];
                 let tp = run.tp || 1;
-                if (tp === 2 && run.network === "Ethernet") {
-                    tp = "2_eth";
+                if (tp === 2) {
+                    if (run.network === "Ethernet") {
+                        if (run.tag === "usb") tp = "2_usb";
+                        else tp = "2_eth";
+                    } else if (run.tag === "usb") {
+                        tp = "2_usb";
+                    }
                 }
 
                 if (!m.results[tp]) m.results[tp] = { triton: null, rocm: null };
@@ -766,6 +778,10 @@
                 if (state.showTriton) cols.push({ id: "tp2_eth_triton", label: "TP2 Eth Triton" });
                 if (state.showRocm) cols.push({ id: "tp2_eth_rocm", label: "TP2 Eth ROCm" });
             }
+            if (state.showTP2Usb) {
+                if (state.showTriton) cols.push({ id: "tp2_usb_triton", label: "TP2 TB Triton" });
+                if (state.showRocm) cols.push({ id: "tp2_usb_rocm", label: "TP2 TB ROCm" });
+            }
 
             // Thead
             let theadHtml = `<thead><tr><th class="col-model">Model</th>`;
@@ -835,6 +851,8 @@
             if (colId === "tp2_rocm") return m.results[2]?.rocm;
             if (colId === "tp2_eth_triton") return m.results["2_eth"]?.triton;
             if (colId === "tp2_eth_rocm") return m.results["2_eth"]?.rocm;
+            if (colId === "tp2_usb_triton") return m.results["2_usb"]?.triton;
+            if (colId === "tp2_usb_rocm") return m.results["2_usb"]?.rocm;
             return null;
         }
 
diff --git a/docs/parse_results.py b/docs/parse_results.py
index 820e43f..352131d 100644
--- a/docs/parse_results.py
+++ b/docs/parse_results.py
@@ -68,8 +68,27 @@ def parse_logs():
             
             # Network
             network = "RoCE"
+            network_prefix = ""
             if "_eth" in rest:
                 network = "Ethernet"
+                network_prefix = "_eth"
+                
+            # Tag Extraction
+            tag = ""
+            test_type_str = ""
+            if "throughput" in fname:
+                test_type_str = "_throughput.json"
+            elif "latency" in fname:
+                qps_match = re.search(r"(_qps[\d\.]+)_latency\.json$", rest)
+                if qps_match:
+                    test_type_str = qps_match.group(0)
+                else:
+                    test_type_str = "_latency.json"
+            
+            raw_prefix = f"{tp}{network_prefix}"
+            if rest.endswith(test_type_str):
+                tag_part = rest[len(raw_prefix):-len(test_type_str)]
+                tag = tag_part.lstrip("_")
             
             # Model Name
             if "_" in model_part:
@@ -93,6 +112,7 @@ def parse_logs():
                 "name_params_b": params_b,
                 "backend": backend_name, # "Triton" or "ROCm"
                 "network": network,
+                "tag": tag,
                 "error": False
             }
 
diff --git a/docs/results.json b/docs/results.json
index 7eb8236..e55d002 100644
--- a/docs/results.json
+++ b/docs/results.json
@@ -10,6 +10,7 @@
       "name_params_b": 8.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -25,6 +26,7 @@
       "name_params_b": 12.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -40,6 +42,7 @@
       "name_params_b": 14.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -55,6 +58,7 @@
       "name_params_b": 20.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -70,6 +74,7 @@
       "name_params_b": 120.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -85,6 +90,7 @@
       "name_params_b": null,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -100,6 +106,7 @@
       "name_params_b": 30.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -115,6 +122,7 @@
       "name_params_b": 30.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -130,6 +138,7 @@
       "name_params_b": 80.0,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -145,6 +154,7 @@
       "name_params_b": null,
       "backend": "Triton",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -160,6 +170,7 @@
       "name_params_b": 8.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -175,6 +186,7 @@
       "name_params_b": 12.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -190,6 +202,7 @@
       "name_params_b": 14.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -205,6 +218,7 @@
       "name_params_b": 20.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -220,6 +234,7 @@
       "name_params_b": 120.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -235,6 +250,7 @@
       "name_params_b": null,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -250,6 +266,7 @@
       "name_params_b": 30.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -265,6 +282,7 @@
       "name_params_b": 30.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -280,6 +298,7 @@
       "name_params_b": 80.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -295,6 +314,7 @@
       "name_params_b": null,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -310,6 +330,7 @@
       "name_params_b": 8.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -325,6 +346,7 @@
       "name_params_b": 14.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -340,6 +362,7 @@
       "name_params_b": 20.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -355,6 +378,7 @@
       "name_params_b": 120.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -370,6 +394,7 @@
       "name_params_b": null,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -385,6 +410,7 @@
       "name_params_b": 30.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -400,6 +426,7 @@
       "name_params_b": 30.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -415,6 +442,7 @@
       "name_params_b": 80.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -430,11 +458,172 @@
       "name_params_b": 12.0,
       "backend": "Triton",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
       "tps_mean": 159.95620436815713
     },
+    {
+      "model": "Qwen/Qwen3-14B-AWQ",
+      "model_clean": "Qwen/Qwen3-14B-AWQ",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "AWQ",
+      "params_b": 14.0,
+      "name_params_b": 14.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 302.43330149293365
+    },
+    {
+      "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+      "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "GPTQ",
+      "params_b": 30.0,
+      "name_params_b": 30.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 291.6914862601304
+    },
+    {
+      "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+      "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "GPTQ",
+      "params_b": 30.0,
+      "name_params_b": 30.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 256.91287782898553
+    },
+    {
+      "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+      "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "GPTQ",
+      "params_b": 80.0,
+      "name_params_b": 80.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 178.05541683872298
+    },
+    {
+      "model": "google/gemma-3-12b-it",
+      "model_clean": "google/gemma-3-12b-it",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 12.0,
+      "name_params_b": 12.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 260.10671430704866
+    },
+    {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 8.0,
+      "name_params_b": 8.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 513.235753981134
+    },
+    {
+      "model": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ",
+      "model_clean": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": null,
+      "name_params_b": null,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 99.38780646163637
+    },
+    {
+      "model": "openai/gpt-oss-120b",
+      "model_clean": "openai/gpt-oss-120b",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 120.0,
+      "name_params_b": 120.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 109.82173992256857
+    },
+    {
+      "model": "openai/gpt-oss-20b",
+      "model_clean": "openai/gpt-oss-20b",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 20.0,
+      "name_params_b": 20.0,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 521.2677328949931
+    },
+    {
+      "model": "zai-org/GLM-4.7-Flash",
+      "model_clean": "zai-org/GLM-4.7-Flash",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": null,
+      "name_params_b": null,
+      "backend": "Triton",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 320.9476287228403
+    },
     {
       "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -445,6 +634,7 @@
       "name_params_b": 8.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -460,6 +650,7 @@
       "name_params_b": 12.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -475,6 +666,7 @@
       "name_params_b": 14.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -490,6 +682,7 @@
       "name_params_b": 20.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -505,6 +698,7 @@
       "name_params_b": 120.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -520,6 +714,7 @@
       "name_params_b": 30.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -535,6 +730,7 @@
       "name_params_b": 30.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -550,6 +746,7 @@
       "name_params_b": 80.0,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -565,6 +762,7 @@
       "name_params_b": null,
       "backend": "ROCm",
       "network": "Ethernet",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -580,6 +778,7 @@
       "name_params_b": 8.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -595,6 +794,7 @@
       "name_params_b": 12.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -610,6 +810,7 @@
       "name_params_b": 14.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -625,6 +826,7 @@
       "name_params_b": 20.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -640,6 +842,7 @@
       "name_params_b": 120.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -655,6 +858,7 @@
       "name_params_b": 30.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -670,6 +874,7 @@
       "name_params_b": 30.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -685,6 +890,7 @@
       "name_params_b": 80.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 2,
@@ -700,6 +906,7 @@
       "name_params_b": 8.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -715,6 +922,7 @@
       "name_params_b": 14.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -730,6 +938,7 @@
       "name_params_b": 20.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -745,6 +954,7 @@
       "name_params_b": 120.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -760,6 +970,7 @@
       "name_params_b": 30.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -775,6 +986,7 @@
       "name_params_b": 30.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -790,6 +1002,7 @@
       "name_params_b": 80.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
@@ -805,10 +1018,155 @@
       "name_params_b": 12.0,
       "backend": "ROCm",
       "network": "RoCE",
+      "tag": "",
       "error": false,
       "test": "Throughput",
       "tp": 1,
       "tps_mean": 267.99881204205957
+    },
+    {
+      "model": "Qwen/Qwen3-14B-AWQ",
+      "model_clean": "Qwen/Qwen3-14B-AWQ",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "AWQ",
+      "params_b": 14.0,
+      "name_params_b": 14.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 311.59947769256274
+    },
+    {
+      "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+      "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "GPTQ",
+      "params_b": 30.0,
+      "name_params_b": 30.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 299.24102990342374
+    },
+    {
+      "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+      "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "GPTQ",
+      "params_b": 30.0,
+      "name_params_b": 30.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 267.6893183038276
+    },
+    {
+      "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+      "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "GPTQ",
+      "params_b": 80.0,
+      "name_params_b": 80.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 174.78952518165474
+    },
+    {
+      "model": "google/gemma-3-12b-it",
+      "model_clean": "google/gemma-3-12b-it",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 12.0,
+      "name_params_b": 12.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 342.7783704164132
+    },
+    {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 8.0,
+      "name_params_b": 8.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 550.8856190566602
+    },
+    {
+      "model": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ",
+      "model_clean": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": null,
+      "name_params_b": null,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 98.10548978313048
+    },
+    {
+      "model": "openai/gpt-oss-120b",
+      "model_clean": "openai/gpt-oss-120b",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 120.0,
+      "name_params_b": 120.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 111.32822174858649
+    },
+    {
+      "model": "openai/gpt-oss-20b",
+      "model_clean": "openai/gpt-oss-20b",
+      "env": "TP2",
+      "gpu_config": "dual",
+      "quant": "BF16",
+      "params_b": 20.0,
+      "name_params_b": 20.0,
+      "backend": "ROCm",
+      "network": "Ethernet",
+      "tag": "usb",
+      "error": false,
+      "test": "Throughput",
+      "tp": 2,
+      "tps_mean": 519.7896706376232
     }
   ]
 }
\ No newline at end of file
diff --git a/rdma_cluster/compare_eth_vs_rdma.sh b/rdma_cluster/compare_eth_vs_rdma.sh
index d4a5cc9..6956029 100755
--- a/rdma_cluster/compare_eth_vs_rdma.sh
+++ b/rdma_cluster/compare_eth_vs_rdma.sh
@@ -3,62 +3,140 @@
 # -------- dynamic config --------
 HOST_ROCE="192.168.100.2"
 HOST_ETH="192.168.1.127"
+HOST_TB="192.168.2.2"
 
-# Automatically detect local and remote RDMA device names
-RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
-RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
+# Parse args
+RUN_ETH=true
+RUN_ROCE=true
+RUN_TB=true
+RUN_RDMA=true
+
+# If any flags are provided, turn off defaults and only run requested
+if [ "$#" -gt 0 ]; then
+    RUN_ETH=false
+    RUN_ROCE=false
+    RUN_TB=false
+    RUN_RDMA=false
+fi
+
+while getopts "ertih" opt; do
+    case ${opt} in
+        e ) RUN_ETH=true ;;
+        r ) RUN_ROCE=true ;;
+        t ) RUN_TB=true ;;
+        i ) RUN_RDMA=true ;;
+        h ) echo "Usage: $0 [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)]"
+            echo
+            echo "Options:"
+            echo "  -e    Run benchmarking for standard Ethernet (1G LAN)."
+            echo "  -r    Run benchmarking for RoCE NIC (via Ethernet/TCP)."
+            echo "  -t    Run benchmarking for Thunderbolt link."
+            echo "  -i    Run benchmarking for RDMA (RoCE v2)."
+            echo "  -h    Print this help message and exit."
+            echo
+            echo "If no arguments are provided, all benchmarks are executed."
+            exit 0
+            ;;
+        \? ) echo "Usage: cmd [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)] [-h (Help)]"
+             exit 1
+             ;;
+    esac
+done
+
+# Automatically detect local and remote RDMA device names if needed
+if [ "$RUN_RDMA" = true ]; then
+    RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
+    RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
+fi
 
 WORKDIR="/tmp/rdma_bench"
 mkdir -p "$WORKDIR"
 
 # -------- helpers --------
 parse_ping_avg() {
-    grep rtt "$1" | awk -F'/' '{print $5}'
+    if [ -f "$1" ]; then
+        grep rtt "$1" | awk -F'/' '{print $5}'
+    else
+        echo "0"
+    fi
 }
 
 parse_iperf_gbps() {
-    grep receiver "$1" | tail -n1 | awk '
-    {
-        val=$(NF-2);
-        unit=$(NF-1);
-        if (unit=="Mbits/sec") printf "%.2f", val/1000;
-        else if (unit=="Gbits/sec") printf "%.2f", val;
-        else print "N/A";
-    }'
+    if [ -f "$1" ]; then
+        grep receiver "$1" | tail -n1 | awk '
+        {
+            val=$(NF-2);
+            unit=$(NF-1);
+            if (unit=="Mbits/sec") printf "%.2f", val/1000;
+            else if (unit=="Gbits/sec") printf "%.2f", val;
+            else print "0.00";
+        }'
+    else
+        echo "0.00"
+    fi
 }
 
 parse_rdma_lat_us() {
-    val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
-    echo "${val:-0}"
+    if [ -f "$1" ]; then
+        val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
+        echo "${val:-0}"
+    else
+        echo "0"
+    fi
 }
 
 parse_rdma_bw_mib() {
-    val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
-    echo "${val:-0}"
+    if [ -f "$1" ]; then
+        val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
+        echo "${val:-0}"
+    else
+        echo "0"
+    fi
 }
 
-# -------- normal ethernet --------
-ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
-ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
-sleep 1
-iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
+# Clear old results
+rm -f "$WORKDIR"/*.txt
 
-# -------- roce ethernet (tcp) --------
-ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
-ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
-sleep 1
-iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
+if [ "$RUN_ETH" = true ]; then
+    # -------- normal ethernet --------
+    echo "[*] Benchmarking Ethernet (1G LAN)..."
+    ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
+    ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
+    sleep 1
+    iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
+fi
 
-# -------- rdma latency --------
-ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
-sleep 2
-ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
+if [ "$RUN_ROCE" = true ]; then
+    # -------- roce ethernet (tcp) --------
+    echo "[*] Benchmarking RoCE NIC (Ethernet/TCP)..."
+    ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
+    ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
+    sleep 1
+    iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
+fi
 
-# -------- rdma bandwidth (maximized) --------
-# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
-ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
-sleep 2
-ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
+if [ "$RUN_TB" = true ]; then
+    # -------- thunderbolt ethernet (tcp) --------
+    echo "[*] Benchmarking Thunderbolt..."
+    ping -c 10 "$HOST_TB" > "$WORKDIR/ping_tb.txt"
+    ssh "$HOST_TB" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
+    sleep 1
+    iperf3 -c "$HOST_TB" -P 8 -t 10 > "$WORKDIR/iperf_tb.txt"
+fi
+
+if [ "$RUN_RDMA" = true ]; then
+    # -------- rdma latency --------
+    echo "[*] Benchmarking RDMA (RoCE v2)..."
+    ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
+    sleep 2
+    ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
+
+    # -------- rdma bandwidth (maximized) --------
+    # We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
+    ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
+    sleep 2
+    ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
+fi
 
 # -------- parse --------
 ETH_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_eth.txt")
@@ -67,13 +145,17 @@ ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt")
 ROCE_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_roce.txt")
 ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt")
 
+TB_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_tb.txt")
+TB_BW=$(parse_iperf_gbps "$WORKDIR/iperf_tb.txt")
+
 RDMA_LAT_US=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt")
 RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt")
 
 # Convert units for dual display
-ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')")
-ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')")
-RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')")
+ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
+ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
+TB_LAT_US=$(python3 -c "print(f'{float(${TB_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
+RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')" 2>/dev/null || echo "0.00")
 
 RDMA_BW_GBPS=$(python3 - <<EOF
 import sys
@@ -88,9 +170,18 @@ EOF
 echo
 echo "=== Network Comparison ==="
 echo
-printf "%-20s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
-echo "----------------------------------------------------------------"
-printf "%-20s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS} ms" "${ETH_LAT_US} us" "${ETH_BW} Gbps"
-printf "%-20s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS} ms" "${ROCE_LAT_US} us" "${ROCE_BW} Gbps"
-printf "%-20s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS} ms" "${RDMA_LAT_US} us" "${RDMA_BW_GBPS} Gbps"
+printf "%-25s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
+echo "-----------------------------------------------------------------------"
+if [ "$RUN_ETH" = true ]; then
+    printf "%-25s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS:-0.00} ms" "${ETH_LAT_US:-0.00} us" "${ETH_BW:-0.00} Gbps"
+fi
+if [ "$RUN_ROCE" = true ]; then
+    printf "%-25s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS:-0.00} ms" "${ROCE_LAT_US:-0.00} us" "${ROCE_BW:-0.00} Gbps"
+fi
+if [ "$RUN_TB" = true ]; then
+    printf "%-25s %-15s %-15s %-12s\n" "Ethernet (Thunderbolt)" "${TB_LAT_MS:-0.00} ms" "${TB_LAT_US:-0.00} us" "${TB_BW:-0.00} Gbps"
+fi
+if [ "$RUN_RDMA" = true ]; then
+    printf "%-25s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS:-0.00} ms" "${RDMA_LAT_US:-0.00} us" "${RDMA_BW_GBPS:-0.00} Gbps"
+fi
 echo
diff --git a/rdma_cluster/setup_guide.md b/rdma_cluster/setup_guide.md
index bd45053..b0c6d9c 100644
--- a/rdma_cluster/setup_guide.md
+++ b/rdma_cluster/setup_guide.md
@@ -333,3 +333,61 @@ If you see link issues, ensure your Intel E810 firmware is up to date using the
 
 *   **Reddit - Strix Halo Batching with Tensor Parallel**: [Thread by Hungry_Elk_3276](https://www.reddit.com/r/LocalLLaMA/comments/1p8nped/strix_halo_batching_with_tensor_parallel_and/)
     *   Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1151` support in upstream RCCL.
+
+---
+
+## 9. Alternative: Thunderbolt Networking
+
+If you do not have dedicated 100GbE RDMA network cards, you can directly connect the two nodes using a high-quality **Thunderbolt 4 / USB4 cable**. This will create a `thunderbolt0` network interface.
+
+While it lacks the ultra-low, microsecond-level latency of RDMA, it provides significantly more bandwidth than standard 1GbE/5GbE Ethernet and is easier to configure.
+
+> **Note**: `thunderbolt-net` relies on the standard OS kernel TCP/IP stack.
+
+### 9.1 Thunderbolt Configuration
+
+**1. Establish Connection:**
+Connect the nodes directly using a certified Thunderbolt 4 or USB4 cable. Verify the link is active:
+```bash
+ip link show thunderbolt0
+```
+
+**2. Network Configuration (Head - Node 1):**
+Configure a persistent connection using `nmcli` with a static IP and Jumbo Frames (reduces CPU overhead).
+*Note: Jumbo Frames may be unsupported on some Thunderbolt host controllers.*
+```bash
+sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.1/24 mtu 9000
+sudo nmcli connection up thunderbolt0
+```
+
+**3. Network Configuration (Worker - Node 2):**
+```bash
+sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.2/24 mtu 9000
+sudo nmcli connection up thunderbolt0
+```
+
+**4. Firewall Rules:**
+To ensure Ray and NCCL can communicate freely over this link:
+```bash
+# Assign the interface to the trusted zone permanently
+sudo firewall-cmd --permanent --zone=trusted --add-interface=thunderbolt0
+sudo firewall-cmd --reload
+```
+
+### 9.2 Running vLLM over Thunderbolt
+
+Our cluster scripts dynamically detect the network interface based on the provided IPs. There is no need to manually export environment variables!
+
+1. Open the Toolbox: `toolbox enter vllm`
+2. Launch the cluster manager: `start-vllm-cluster`
+3. Select **Option 1 (Configure IPs)**.
+4. Set the **Head IP** explicitly to `192.168.2.1` and the **Worker IP** to `192.168.2.2`.
+5. Start the cluster normally (Option 2). The script will automatically discover and utilize `thunderbolt0` as the backend network for Ray orchestration and GPU synchronization.
+
+### 9.3 Validating the Link
+The `compare_eth_vs_rdma.sh` script includes Thunderbolt support. Run it from inside the toolbox to see the latency and bandwidth of your Thunderbolt link compared to your other network interfaces.
+
+Use the `-t` flag to benchmark only the Thunderbolt connection (or `-e`, `-r`, `-i` for standard Ethernet, the RoCE NIC over TCP, and RDMA respectively):
+```bash
+/opt/compare_eth_vs_rdma.sh -t
+```