diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..c8d59e9 --- /dev/null +++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 524.2037815230142, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.3815310134141399, + "tokens_per_second": 280.05330212131406 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json index 39c9888..b531b4d 100644 --- a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 421.75657659699937, + "elapsed_time": 424.04632396099623, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.4742071875054738, - "tokens_per_second": 348.0799308087054 + "requests_per_second": 0.4716465836369236, + "tokens_per_second": 346.2003835540928 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json index 964d4fc..e4c661b 100644 --- a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json +++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 868.8101008250001, + "elapsed_time": 918.187000697013, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.2301999019234296, - "tokens_per_second": 168.9724830093454 + "requests_per_second": 0.21782055272855774, + "tokens_per_second": 159.8857312165796 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..b1173a2 --- /dev/null +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 456.08530166203855, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.4385144604993234, + "tokens_per_second": 321.88057686801585 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json index dcc2f2e..ff0faf9 100644 --- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 457.7749735690013, + "elapsed_time": 458.737264430034, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.4368958801760569, - "tokens_per_second": 320.69249844623016 + "requests_per_second": 0.4359794058773347, + "tokens_per_second": 320.0197833991106 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json index f96f2c5..000ae7f 100644 --- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 644.1538858940003, + "elapsed_time": 686.8188757880125, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.3104848148551126, - "tokens_per_second": 227.90361622402403 + "requests_per_second": 0.29119758796747197, + "tokens_per_second": 213.74630950782364 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..8f5724f --- /dev/null +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 534.8865945799625, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.3739110346503573, + "tokens_per_second": 274.46004720922855 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json index f371702..1651003 100644 --- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 534.4193308840004, + "elapsed_time": 524.8208868440124, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.3742379596733028, - "tokens_per_second": 274.7000183491961 + "requests_per_second": 0.38108239403864297, + "tokens_per_second": 279.7240042842149 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json index 3036af0..2853531 100644 --- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json +++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 733.5017090729998, + "elapsed_time": 789.1420173590304, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.2726646680247824, - "tokens_per_second": 200.1426829468909 + "requests_per_second": 0.2534398062712803, + "tokens_per_second": 186.03115379827653 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..2f265ac --- /dev/null +++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 805.9022228560061, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.24816906360082697, + "tokens_per_second": 182.16229690959702 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json index 832db98..ee4d91e 100644 --- a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 879.0596038709991, + "elapsed_time": 748.1414223780157, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.22751585799106944, - "tokens_per_second": 167.00232766189475 + "requests_per_second": 0.2673291359329993, + "tokens_per_second": 196.2262690032198 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json index d0f0fcd..b764558 100644 --- a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json +++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 1109.9732099440007, + "elapsed_time": 1168.3619703819859, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.18018452896722634, - "tokens_per_second": 132.2599488751683 + "requests_per_second": 0.17117982703135376, + "tokens_per_second": 125.65027253668944 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..50cb46f --- /dev/null +++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 510.63144373201067, + "num_requests": 200, + "total_num_tokens": 148857, + "requests_per_second": 0.391671923958063, + "tokens_per_second": 291.5155379231269 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json index 0e851a2..55666a7 100644 --- a/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 504.69023761399876, + "elapsed_time": 520.7929677469656, "num_requests": 200, "total_num_tokens": 148857, - "requests_per_second": 0.39628268013570256, - "tokens_per_second": 294.9472545848014 + "requests_per_second": 0.3840297630462106, + "tokens_per_second": 285.8275921888489 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json index 9a618f6..34e2f33 100644 --- a/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json +++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 876.911706677, + "elapsed_time": 930.6109793490032, "num_requests": 200, "total_num_tokens": 148857, - "requests_per_second": 0.22807313265081958, - "tokens_per_second": 169.75141153501525 + "requests_per_second": 0.2149125729635249, + "tokens_per_second": 159.95620436815713 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..13dd005 --- /dev/null +++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 237.61095946098794, + "num_requests": 200, + "total_num_tokens": 145877, + "requests_per_second": 0.8417120172137385, + "tokens_per_second": 613.9321196754427 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json index b3657d4..ea7c603 100644 --- a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 244.51837097500174, + "elapsed_time": 247.22850671299966, "num_requests": 200, "total_num_tokens": 145877, - "requests_per_second": 0.8179344529513773, - "tokens_per_second": 596.5891209659404 + "requests_per_second": 0.8089681997399035, + "tokens_per_second": 590.0492703672895 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json index c439d54..6f80c84 100644 --- a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json +++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 380.55349342600005, + "elapsed_time": 395.08209386101225, "num_requests": 200, "total_num_tokens": 145877, - "requests_per_second": 0.5255502930730307, - "tokens_per_second": 383.3285005130725 + "requests_per_second": 0.5062239041143659, + "tokens_per_second": 369.23212230245684 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..f33f00b --- /dev/null +++ b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1361.426551499986, + "num_requests": 200, + "total_num_tokens": 146523, + "requests_per_second": 0.14690473002722398, + "tokens_per_second": 107.62460878889469 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_throughput.json b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_throughput.json new file mode 100644 index 0000000..9f821f8 --- /dev/null +++ b/benchmarks/benchmark_results/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1482.2689266130328, + "num_requests": 200, + "total_num_tokens": 146523, + "requests_per_second": 0.13492828218223374, + "tokens_per_second": 98.85048345093716 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..11d2b5e --- /dev/null +++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1724.1368565150187, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.11600007229371459, + "tokens_per_second": 85.2809331488931 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json index 5e00991..dc1f6da 100644 --- a/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 1307.2402118169994, + "elapsed_time": 1199.1163451180328, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.15299406963775225, - "tokens_per_second": 112.4781801162827 + "requests_per_second": 0.16678948695367285, + "tokens_per_second": 122.62029501860121 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json index 82e552b..29954d6 100644 --- a/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json +++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 1886.751298176, + "elapsed_time": 1959.4152568069985, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.10600231211890418, - "tokens_per_second": 77.93077982357597 + "requests_per_second": 0.10207126810164463, + "tokens_per_second": 75.0407548829671 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..7a5e6da --- /dev/null +++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 243.98866786801955, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.819710201082723, + "tokens_per_second": 602.6345456319963 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json index 4ae2cb0..2af6db1 100644 --- a/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 247.62527259899798, + "elapsed_time": 242.14750060701044, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.8076720033495051, - "tokens_per_second": 593.7843034224891 + "requests_per_second": 0.825942863331829, + "tokens_per_second": 607.216674264294 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json index 2220af1..24cf6b6 100644 --- a/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json +++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 341.2666312900001, + "elapsed_time": 357.72086531698005, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.5860520240258851, - "tokens_per_second": 430.8537270233502 + "requests_per_second": 0.5590951476167821, + "tokens_per_second": 411.03557062490586 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..25294a2 --- /dev/null +++ b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 486.3392907420057, + "num_requests": 200, + "total_num_tokens": 146278, + "requests_per_second": 0.41123553824915293, + "tokens_per_second": 300.773560320048 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json index 4ca4bb0..939435d 100644 --- a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 422.7612150579989, + "elapsed_time": 398.827027003048, "num_requests": 200, "total_num_tokens": 146278, - "requests_per_second": 0.47308029420949094, - "tokens_per_second": 346.0061963818796 + "requests_per_second": 0.5014705284716613, + "tokens_per_second": 366.77052981888835 } \ No newline at end of file diff --git a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json index de26852..26e68d8 100644 --- a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json +++ b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 594.5536415039987, + "elapsed_time": 610.5734472059994, "num_requests": 200, "total_num_tokens": 146278, - "requests_per_second": 0.33638680522429343, - "tokens_per_second": 246.02994547299596 + "requests_per_second": 0.32756091984544267, + "tokens_per_second": 239.57478116575834 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..862fc3c --- /dev/null +++ b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 497.111974740983, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.40232384284085837, + "tokens_per_second": 295.31575874126105 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json index 1a4060e..ace2f01 100644 --- a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 395.26841144900027, + "elapsed_time": 399.3928133630543, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.5059852854591319, - "tokens_per_second": 371.4058491591393 + "requests_per_second": 0.5007601371589951, + "tokens_per_second": 367.5704596781314 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json index 84a4499..5542fca 100644 --- a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 769.1666062429999, + "elapsed_time": 813.6141017450136, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.260021688898978, - "tokens_per_second": 190.86242019407229 + "requests_per_second": 0.24581678165489804, + "tokens_per_second": 180.43566315423652 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..ad5aff8 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 456.45958357997006, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.4381548929949473, + "tokens_per_second": 321.6166453306162 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json index 58e09ea..0b7d5a2 100644 --- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 464.71097393700256, + "elapsed_time": 440.66104900900973, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.43037503139986644, - "tokens_per_second": 315.906032423287 + "requests_per_second": 0.4538635771184551, + "tokens_per_second": 333.147212194374 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json index ca60feb..502b15a 100644 --- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 638.3282979609994, + "elapsed_time": 683.9224744850071, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.31331839844615444, - "tokens_per_second": 229.9835374194385 + "requests_per_second": 0.29243080533447857, + "tokens_per_second": 214.65152188564062 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..c350ba8 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 517.5916094129789, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.38640502736670695, + "tokens_per_second": 283.6309502128471 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json index 7b2b31b..62b3241 100644 --- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 502.6907218439992, + "elapsed_time": 497.59323585999664, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.3978589444944367, - "tokens_per_second": 292.0384117325289 + "requests_per_second": 0.4019347241614679, + "tokens_per_second": 295.0301359026215 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json index d048712..914f41a 100644 --- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 721.7994779089986, + "elapsed_time": 780.1687226030044, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.2770852655357769, - "tokens_per_second": 203.38751203489863 + "requests_per_second": 0.2563548040386794, + "tokens_per_second": 188.17083503449163 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..f9c27ad --- /dev/null +++ b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 802.5698999410379, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.24919947784572202, + "tokens_per_second": 182.9186467257061 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json index 89f11e0..769c7c2 100644 --- a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 886.8526372269989, + "elapsed_time": 757.2171181479935, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.2255166096425645, - "tokens_per_second": 165.5348293928834 + "requests_per_second": 0.2641250378612165, + "tokens_per_second": 193.87438091607942 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json index f5d2546..bc4aeed 100644 --- a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 1084.3601952080007, + "elapsed_time": 1144.2253085140255, "num_requests": 200, "total_num_tokens": 146805, - "requests_per_second": 0.18444055848217136, - "tokens_per_second": 135.3839809398758 + "requests_per_second": 0.1747907501362075, + "tokens_per_second": 128.30078036872973 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..780ed0d --- /dev/null +++ b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 373.92354663898004, + "num_requests": 200, + "total_num_tokens": 148857, + "requests_per_second": 0.5348686965496139, + "tokens_per_second": 398.09474781142933 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json index 0e86f1e..3675e9b 100644 --- a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 369.2837602610016, + "elapsed_time": 374.03978066996206, "num_requests": 200, "total_num_tokens": 148857, - "requests_per_second": 0.5415889392445647, - "tokens_per_second": 403.09652364564084 + "requests_per_second": 0.5347024844303181, + "tokens_per_second": 397.9710386242193 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json index e96b629..79cdd40 100644 --- a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 509.0738683320001, + "elapsed_time": 555.4390292470343, "num_requests": 200, "total_num_tokens": 148857, - "requests_per_second": 0.39287029337276264, - "tokens_per_second": 292.4074663029466 + "requests_per_second": 0.36007552488906747, + "tokens_per_second": 267.99881204205957 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..db30ee2 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 213.75922767800512, + "num_requests": 200, + "total_num_tokens": 145877, + "requests_per_second": 0.9356321229849724, + "tokens_per_second": 682.4360360233941 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json index efbb6ad..b00a77d 100644 --- a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 224.76228898300178, + "elapsed_time": 224.3753512299736, "num_requests": 200, "total_num_tokens": 145877, - "requests_per_second": 0.8898289873490544, - "tokens_per_second": 649.02791593759 + "requests_per_second": 0.8913635071929533, + "tokens_per_second": 650.1471716939323 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json index 4a572c5..cd4f7c7 100644 --- a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 322.171811016, + "elapsed_time": 336.45260514499387, "num_requests": 200, "total_num_tokens": 145877, - "requests_per_second": 0.620786776376495, - "tokens_per_second": 452.7925628873698 + "requests_per_second": 0.5944373648520577, + "tokens_per_second": 433.5736973626181 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..b995d3e --- /dev/null +++ b/benchmarks/benchmark_results_rocm/mratsim_MiniMax-M2.5-BF16-INT4-AWQ_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1484.8385301349917, + "num_requests": 200, + "total_num_tokens": 146523, + "requests_per_second": 0.1346947805710681, + "tokens_per_second": 98.67941666807306 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..4431855 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1707.9124416089617, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.11710202181769186, + "tokens_per_second": 86.0910643999307 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json index 0041800..58e786c 100644 --- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 1315.035868578001, + "elapsed_time": 1242.463667072996, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.15208710635115047, - "tokens_per_second": 111.8113988472388 + "requests_per_second": 0.16097050183460196, + "tokens_per_second": 118.34229353876268 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json index e488eec..dd58d43 100644 --- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 1923.4690410719995, + "elapsed_time": 1966.935257990961, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.10397879858182421, - "tokens_per_second": 76.44313314138553 + "requests_per_second": 0.10168102848706935, + "tokens_per_second": 74.75385852312364 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_throughput.json new file mode 100644 index 0000000..86844e3 --- /dev/null +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_eth_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 299.5004001749912, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.6677787404729495, + "tokens_per_second": 490.93757442090305 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json index ef3781e..11ff708 100644 --- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 246.0529060009976, + "elapsed_time": 244.54776988498634, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.8128333180474167, - "tokens_per_second": 597.5787987620997 + "requests_per_second": 0.8178361229548825, + "tokens_per_second": 601.2567608739705 } \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json index 2f47f62..6d9c5ba 100644 --- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json +++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json @@ -1,7 +1,7 @@ { - "elapsed_time": 333.59849170300004, + "elapsed_time": 362.9645123449736, "num_requests": 200, "total_num_tokens": 147036, - "requests_per_second": 0.5995230943012126, - "tokens_per_second": 440.75738846836555 + "requests_per_second": 0.5510180560294371, + "tokens_per_second": 405.0974544317216 } \ No newline at end of file diff --git a/benchmarks/find_max_context.py b/benchmarks/find_max_context.py index 79ec0ff..a3eeed8 100644 --- a/benchmarks/find_max_context.py +++ b/benchmarks/find_max_context.py @@ -15,18 +15,21 @@ except ImportError: print("Error: 'transformers' not found. Please install it or run in vLLM environment.") sys.exit(1) + # Import path handling for scripts/models.py try: import sys, os sys.path.append(str(Path(__file__).parent.parent / "scripts")) import models + import cluster_manager # Import shared cluster logic except ImportError: - print("Error: Could not import scripts/models.py.") + print("Error: Could not import scripts/models.py or cluster_manager.py.") sys.exit(1) # Import Utils from run_vllm_bench (keep utils shared) try: - from run_vllm_bench import get_gpu_count, kill_vllm + from run_vllm_bench import kill_vllm + # We do NOT import get_gpu_count because we are overriding it for cluster awareness except ImportError: print("Error: Could not import run_vllm_bench.py.") sys.exit(1) @@ -65,7 +68,30 @@ CONCURRENCY_STEPS = [1, 4, 8, 16] def log(msg): print(f"[MAX-CTX] {msg}", flush=True) +def get_gpu_count(): + """ + Returns total GPUs. + If Ray Cluster is active, returns TOTAL cluster GPUs (e.g., 2). + Otherwise returns local AMD GPUs. + """ + if cluster_manager.check_ray_status(): + # Ideally we'd query Ray for total resources, but for this specific 2-node setup: + # If cluster is up, we assume 2 nodes x 1 GPU = 2 GPUs. + # Constructing a Ray client just to count is slow/complex here. + log("Ray Cluster Detected: Assuming 2 GPUs available.") + return 2 + + # Local Fallback + try: + res = subprocess.run("rocm-smi --showid", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if res.returncode == 0: + return res.stdout.count("GPU") + except: pass + return 1 + + def get_hf_context_limit(model_name, trust_remote=False): + # ... (Keep existing implementation) try: cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote) @@ -95,6 +121,7 @@ def get_hf_context_limit(model_name, trust_remote=False): def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs): """ Constructs the vLLM serve command. + Using Ray Backend if tp_size > 1 (Cluster Mode). """ config = MODEL_TABLE[model] @@ -105,15 +132,45 @@ def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs): "--tensor-parallel-size", str(tp_size), "--max-num-seqs", str(max_seqs), "--dtype", "auto", - # "--disable-log-stats" # Cleaner output, but user managed without it + # "--disable-log-stats" ] - if config.get("trust_remote"): cmd.append("--trust-remote-code") - if config.get("enforce_eager"): cmd.append("--enforce-eager") - - # Add model specific env vars + # Env Setup env = os.environ.copy() env.update(config.get("env", {})) + + # CLUSTER / RAY LOGIC + # Only if we need more than 1 GPU do we engage the cluster machinery + if tp_size > 1: + log(f"TP={tp_size} > 1: Using Ray Distributed Backend") + cmd.extend(["--distributed-executor-backend", "ray"]) + + # Inject Cluster Env Vars (similar to start_vllm_cluster.py) + # We need to know Head IP and RDMA Interface + rdma_iface = cluster_manager.get_net_iface() + head_ip = cluster_manager.get_local_ip(rdma_iface) # Assuming we run this ON HEAD + + # IMPORTANT: vLLM needs to bind to the Head IP for Ray workers to reach it? + # Or at least we should be explicit. + cmd.extend(["--host", head_ip]) + + # Update our own process env so verify_context knows where to look? + # No, verify_context runs in THIS process. We need to export it or pass it. + # Simplest is to set it in os.environ for OUR process too, but that might be messy. + # Better: We rely on standard PORT. + + env["RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES"] = "1" + env["VLLM_HOST_IP"] = head_ip + env["NCCL_SOCKET_IFNAME"] = rdma_iface + env["NCCL_IB_GID_INDEX"] = "1" + env["NCCL_IB_DISABLE"] = "0" + env["NCCL_NET_GDR_LEVEL"] = "0" + else: + # Default Localhost bind for single node safety + cmd.extend(["--host", "127.0.0.1"]) + + if config.get("trust_remote"): cmd.append("--trust-remote-code") + if config.get("enforce_eager"): cmd.append("--enforce-eager") return cmd, env @@ -300,7 +357,14 @@ def verify_context(model, context_len): """ Sends a request to the server with length ~context_len to verify stability. """ - url = f"http://{HOST}:{PORT}/v1/completions" + # Use dynamic host if set (by cluster logic), else localhost + # But wait, the env var is set for the SERVER process, not necessarily us? + # Actually, we (the client script) need to know where to send requests. + # If we are on Head, localhost is fine for Head-based server. + # But if we use Ray, vLLM head usually binds to HOST IP. + + target_host = os.getenv("VLLM_HOST_IP", "127.0.0.1") + url = f"http://{target_host}:{PORT}/v1/completions" # We use a simple "A " * N prompt. # Llama 3 tokenizer: "A" is usually 1 token. @@ -529,9 +593,22 @@ def main(): continue config = MODEL_TABLE[model] - valid_tps = [t for t in config["valid_tp"] if t <= gpu_count] - for tp in valid_tps: + # KEY CHANGES: + # We only want to test the MINIMUM required TP. + # If model supports 1 and 2, we ONLY test 1 (local is faster/easier). + # We only test 2 if model VALID_TP *starts* with 2 (or higher). + + valid_tps = config.get("valid_tp", [1]) + min_tp = min(valid_tps) + + if min_tp > gpu_count: + log(f"Skipping {model}: Requires TP={min_tp} but only {gpu_count} GPUs available.") + continue + + tps_to_test = [min_tp] + + for tp in tps_to_test: # Track successful seqs for this TP to skip lower utils # effectively: {seqs_count: max_working_util} # Since we iterate high-util -> low-util, if we succeeded already for this 'seqs', we skip. diff --git a/docs/index.html b/docs/index.html index ecc9862..f593dfb 100644 --- a/docs/index.html +++ b/docs/index.html @@ -469,6 +469,10 @@ style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;"> TP2 + @@ -544,6 +548,7 @@ activeTab: "Throughput", showTP1: true, showTP2: true, + showTP2Eth: true, showTriton: true, showRocm: false }; @@ -615,6 +620,7 @@ // Toggles $('toggleTP1').addEventListener('change', e => { state.showTP1 = e.target.checked; render(); }); $('toggleTP2').addEventListener('change', e => { state.showTP2 = e.target.checked; render(); }); + $('toggleTP2Eth').addEventListener('change', e => { state.showTP2Eth = e.target.checked; render(); }); $('toggleTriton').addEventListener('change', e => { state.showTriton = e.target.checked; render(); }); $('toggleRocm').addEventListener('change', e => { state.showRocm = e.target.checked; render(); }); } @@ -636,13 +642,17 @@ params: run.params_b || run.name_params_b, results: { 1: { triton: null, rocm: null }, - 2: { triton: null, rocm: null } + 2: { triton: null, rocm: null }, + "2_eth": { triton: null, rocm: null } } }; } const m = testGroups[testName].models[modelName]; - const tp = run.tp || 1; + let tp = run.tp || 1; + if (tp === 2 && run.network === "Ethernet") { + tp = "2_eth"; + } if (!m.results[tp]) m.results[tp] = { triton: null, rocm: null }; @@ -749,8 +759,12 @@ if (state.showRocm) cols.push({ id: "tp1_rocm", label: "TP1 ROCm" }); } if (state.showTP2) { - if (state.showTriton) cols.push({ id: "tp2_triton", label: "TP2 Triton" }); - if (state.showRocm) cols.push({ id: "tp2_rocm", label: "TP2 ROCm" }); + if (state.showTriton) cols.push({ id: "tp2_triton", label: "TP2 RoCE Triton" }); + if (state.showRocm) cols.push({ id: "tp2_rocm", label: "TP2 RoCE ROCm" }); + } + if (state.showTP2Eth) { + if (state.showTriton) cols.push({ id: "tp2_eth_triton", label: "TP2 Eth Triton" }); + if (state.showRocm) cols.push({ id: "tp2_eth_rocm", label: "TP2 Eth ROCm" }); } // Thead @@ -790,11 +804,7 @@ // Data Cells cols.forEach(c => { - let val = null; - if (c.id === "tp1_triton") val = m.results[1]?.triton; - if (c.id === "tp1_rocm") val = m.results[1]?.rocm; - if (c.id === "tp2_triton") val = m.results[2]?.triton; - if (c.id === "tp2_rocm") val = m.results[2]?.rocm; + let val = getVal(m, c.id); const bg = c.id.startsWith("tp2") ? 'style="background:#fbfdff;"' : ""; rowHtml += `${formatVal(val, unit)}`; @@ -823,6 +833,8 @@ if (colId === "tp1_rocm") return m.results[1]?.rocm; if (colId === "tp2_triton") return m.results[2]?.triton; if (colId === "tp2_rocm") return m.results[2]?.rocm; + if (colId === "tp2_eth_triton") return m.results["2_eth"]?.triton; + if (colId === "tp2_eth_rocm") return m.results["2_eth"]?.rocm; return null; } diff --git a/docs/parse_results.py b/docs/parse_results.py index 30b3096..820e43f 100644 --- a/docs/parse_results.py +++ b/docs/parse_results.py @@ -66,6 +66,11 @@ def parse_logs(): if not tp_match: continue tp = int(tp_match.group(1)) + # Network + network = "RoCE" + if "_eth" in rest: + network = "Ethernet" + # Model Name if "_" in model_part: model_display = model_part.replace("_", "/", 1) @@ -87,6 +92,7 @@ def parse_logs(): "params_b": params_b, "name_params_b": params_b, "backend": backend_name, # "Triton" or "ROCm" + "network": network, "error": False } diff --git a/docs/results.json b/docs/results.json index e457025..7eb8236 100644 --- a/docs/results.json +++ b/docs/results.json @@ -1,131 +1,5 @@ { "runs": [ - { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "env": "TP1", - "gpu_config": "single", - "quant": "BF16", - "params_b": 8.0, - "name_params_b": 8.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 383.3285005130725 - }, - { - "model": "google/gemma-3-12b-it", - "model_clean": "google/gemma-3-12b-it", - "env": "TP1", - "gpu_config": "single", - "quant": "BF16", - "params_b": 12.0, - "name_params_b": 12.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 169.75141153501525 - }, - { - "model": "Qwen/Qwen3-14B-AWQ", - "model_clean": "Qwen/Qwen3-14B-AWQ", - "env": "TP1", - "gpu_config": "single", - "quant": "AWQ", - "params_b": 14.0, - "name_params_b": 14.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 168.9724830093454 - }, - { - "model": "openai/gpt-oss-20b", - "model_clean": "openai/gpt-oss-20b", - "env": "TP1", - "gpu_config": "single", - "quant": "BF16", - "params_b": 20.0, - "name_params_b": 20.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 430.8537270233502 - }, - { - "model": "openai/gpt-oss-120b", - "model_clean": "openai/gpt-oss-120b", - "env": "TP1", - "gpu_config": "single", - "quant": "BF16", - "params_b": 120.0, - "name_params_b": 120.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 77.93077982357597 - }, - { - "model": "zai-org/GLM-4.7-Flash", - "model_clean": "zai-org/GLM-4.7-Flash", - "env": "TP1", - "gpu_config": "single", - "quant": "BF16", - "params_b": null, - "name_params_b": null, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 246.02994547299596 - }, - { - "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", - "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", - "env": "TP1", - "gpu_config": "single", - "quant": "GPTQ", - "params_b": 30.0, - "name_params_b": 30.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 227.90361622402403 - }, - { - "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", - "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", - "env": "TP1", - "gpu_config": "single", - "quant": "GPTQ", - "params_b": 30.0, - "name_params_b": 30.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 200.1426829468909 - }, - { - "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", - "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", - "env": "TP1", - "gpu_config": "single", - "quant": "GPTQ", - "params_b": 80.0, - "name_params_b": 80.0, - "backend": "Triton", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 132.2599488751683 - }, { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", @@ -135,10 +9,11 @@ "params_b": 8.0, "name_params_b": 8.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 596.5891209659404 + "tps_mean": 613.9321196754427 }, { "model": "google/gemma-3-12b-it", @@ -149,10 +24,11 @@ "params_b": 12.0, "name_params_b": 12.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 294.9472545848014 + "tps_mean": 291.5155379231269 }, { "model": "Qwen/Qwen3-14B-AWQ", @@ -163,10 +39,11 @@ "params_b": 14.0, "name_params_b": 14.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 348.0799308087054 + "tps_mean": 280.05330212131406 }, { "model": "openai/gpt-oss-20b", @@ -177,10 +54,11 @@ "params_b": 20.0, "name_params_b": 20.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 593.7843034224891 + "tps_mean": 602.6345456319963 }, { "model": "openai/gpt-oss-120b", @@ -191,10 +69,11 @@ "params_b": 120.0, "name_params_b": 120.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 112.4781801162827 + "tps_mean": 85.2809331488931 }, { "model": "zai-org/GLM-4.7-Flash", @@ -205,10 +84,11 @@ "params_b": null, "name_params_b": null, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 346.0061963818796 + "tps_mean": 300.773560320048 }, { "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", @@ -219,10 +99,11 @@ "params_b": 30.0, "name_params_b": 30.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 320.69249844623016 + "tps_mean": 321.88057686801585 }, { "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", @@ -233,10 +114,11 @@ "params_b": 30.0, "name_params_b": 30.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 274.7000183491961 + "tps_mean": 274.46004720922855 }, { "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", @@ -247,10 +129,176 @@ "params_b": 80.0, "name_params_b": 80.0, "backend": "Triton", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 167.00232766189475 + "tps_mean": 182.16229690959702 + }, + { + "model": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "model_clean": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "Triton", + "network": "Ethernet", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 107.62460878889469 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 8.0, + "name_params_b": 8.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 590.0492703672895 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 285.8275921888489 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "model_clean": "Qwen/Qwen3-14B-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "AWQ", + "params_b": 14.0, + "name_params_b": 14.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 346.2003835540928 + }, + { + "model": "openai/gpt-oss-20b", + "model_clean": "openai/gpt-oss-20b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 20.0, + "name_params_b": 20.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 607.216674264294 + }, + { + "model": "openai/gpt-oss-120b", + "model_clean": "openai/gpt-oss-120b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 120.0, + "name_params_b": 120.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 122.62029501860121 + }, + { + "model": "zai-org/GLM-4.7-Flash", + "model_clean": "zai-org/GLM-4.7-Flash", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 366.77052981888835 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 320.0197833991106 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 279.7240042842149 + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 80.0, + "name_params_b": 80.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 196.2262690032198 + }, + { + "model": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "model_clean": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 98.85048345093716 }, { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", @@ -260,25 +308,12 @@ "quant": "BF16", "params_b": 8.0, "name_params_b": 8.0, - "backend": "ROCm", + "backend": "Triton", + "network": "RoCE", "error": false, "test": "Throughput", "tp": 1, - "tps_mean": 452.7925628873698 - }, - { - "model": "google/gemma-3-12b-it", - "model_clean": "google/gemma-3-12b-it", - "env": "TP1", - "gpu_config": "single", - "quant": "BF16", - "params_b": 12.0, - "name_params_b": 12.0, - "backend": "ROCm", - "error": false, - "test": "Throughput", - "tp": 1, - "tps_mean": 292.4074663029466 + "tps_mean": 369.23212230245684 }, { "model": "Qwen/Qwen3-14B-AWQ", @@ -288,11 +323,12 @@ "quant": "AWQ", "params_b": 14.0, "name_params_b": 14.0, - "backend": "ROCm", + "backend": "Triton", + "network": "RoCE", "error": false, "test": "Throughput", "tp": 1, - "tps_mean": 190.86242019407229 + "tps_mean": 159.8857312165796 }, { "model": "openai/gpt-oss-20b", @@ -302,11 +338,12 @@ "quant": "BF16", "params_b": 20.0, "name_params_b": 20.0, - "backend": "ROCm", + "backend": "Triton", + "network": "RoCE", "error": false, "test": "Throughput", "tp": 1, - "tps_mean": 440.75738846836555 + "tps_mean": 411.03557062490586 }, { "model": "openai/gpt-oss-120b", @@ -316,11 +353,27 @@ "quant": "BF16", "params_b": 120.0, "name_params_b": 120.0, - "backend": "ROCm", + "backend": "Triton", + "network": "RoCE", "error": false, "test": "Throughput", "tp": 1, - "tps_mean": 76.44313314138553 + "tps_mean": 75.0407548829671 + }, + { + "model": "zai-org/GLM-4.7-Flash", + "model_clean": "zai-org/GLM-4.7-Flash", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 239.57478116575834 }, { "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", @@ -330,11 +383,12 @@ "quant": "GPTQ", "params_b": 30.0, "name_params_b": 30.0, - "backend": "ROCm", + "backend": "Triton", + "network": "RoCE", "error": false, "test": "Throughput", "tp": 1, - "tps_mean": 229.9835374194385 + "tps_mean": 213.74630950782364 }, { "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", @@ -344,11 +398,12 @@ "quant": "GPTQ", "params_b": 30.0, "name_params_b": 30.0, - "backend": "ROCm", + "backend": "Triton", + "network": "RoCE", "error": false, "test": "Throughput", "tp": 1, - "tps_mean": 203.38751203489863 + "tps_mean": 186.03115379827653 }, { "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", @@ -358,11 +413,27 @@ "quant": "GPTQ", "params_b": 80.0, "name_params_b": 80.0, - "backend": "ROCm", + "backend": "Triton", + "network": "RoCE", "error": false, "test": "Throughput", "tp": 1, - "tps_mean": 135.3839809398758 + "tps_mean": 125.65027253668944 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "Triton", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 159.95620436815713 }, { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", @@ -373,10 +444,11 @@ "params_b": 8.0, "name_params_b": 8.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 649.02791593759 + "tps_mean": 682.4360360233941 }, { "model": "google/gemma-3-12b-it", @@ -387,10 +459,11 @@ "params_b": 12.0, "name_params_b": 12.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 403.09652364564084 + "tps_mean": 398.09474781142933 }, { "model": "Qwen/Qwen3-14B-AWQ", @@ -401,10 +474,11 @@ "params_b": 14.0, "name_params_b": 14.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 371.4058491591393 + "tps_mean": 295.31575874126105 }, { "model": "openai/gpt-oss-20b", @@ -415,10 +489,11 @@ "params_b": 20.0, "name_params_b": 20.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 597.5787987620997 + "tps_mean": 490.93757442090305 }, { "model": "openai/gpt-oss-120b", @@ -429,10 +504,11 @@ "params_b": 120.0, "name_params_b": 120.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 111.8113988472388 + "tps_mean": 86.0910643999307 }, { "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", @@ -443,10 +519,11 @@ "params_b": 30.0, "name_params_b": 30.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 315.906032423287 + "tps_mean": 321.6166453306162 }, { "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", @@ -457,10 +534,11 @@ "params_b": 30.0, "name_params_b": 30.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 292.0384117325289 + "tps_mean": 283.6309502128471 }, { "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", @@ -471,10 +549,266 @@ "params_b": 80.0, "name_params_b": 80.0, "backend": "ROCm", + "network": "Ethernet", "error": false, "test": "Throughput", "tp": 2, - "tps_mean": 165.5348293928834 + "tps_mean": 182.9186467257061 + }, + { + "model": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "model_clean": "mratsim/MiniMax-M2.5-BF16-INT4-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": null, + "name_params_b": null, + "backend": "ROCm", + "network": "Ethernet", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 98.67941666807306 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 8.0, + "name_params_b": 8.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 650.1471716939323 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 397.9710386242193 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "model_clean": "Qwen/Qwen3-14B-AWQ", + "env": "TP2", + "gpu_config": "dual", + "quant": "AWQ", + "params_b": 14.0, + "name_params_b": 14.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 367.5704596781314 + }, + { + "model": "openai/gpt-oss-20b", + "model_clean": "openai/gpt-oss-20b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 20.0, + "name_params_b": 20.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 601.2567608739705 + }, + { + "model": "openai/gpt-oss-120b", + "model_clean": "openai/gpt-oss-120b", + "env": "TP2", + "gpu_config": "dual", + "quant": "BF16", + "params_b": 120.0, + "name_params_b": 120.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 118.34229353876268 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 333.147212194374 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 295.0301359026215 + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "env": "TP2", + "gpu_config": "dual", + "quant": "GPTQ", + "params_b": 80.0, + "name_params_b": 80.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 2, + "tps_mean": 193.87438091607942 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 8.0, + "name_params_b": 8.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 433.5736973626181 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "model_clean": "Qwen/Qwen3-14B-AWQ", + "env": "TP1", + "gpu_config": "single", + "quant": "AWQ", + "params_b": 14.0, + "name_params_b": 14.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 180.43566315423652 + }, + { + "model": "openai/gpt-oss-20b", + "model_clean": "openai/gpt-oss-20b", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 20.0, + "name_params_b": 20.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 405.0974544317216 + }, + { + "model": "openai/gpt-oss-120b", + "model_clean": "openai/gpt-oss-120b", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 120.0, + "name_params_b": 120.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 74.75385852312364 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "env": "TP1", + "gpu_config": "single", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 214.65152188564062 + }, + { + "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "env": "TP1", + "gpu_config": "single", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 188.17083503449163 + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "env": "TP1", + "gpu_config": "single", + "quant": "GPTQ", + "params_b": 80.0, + "name_params_b": 80.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 128.30078036872973 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "ROCm", + "network": "RoCE", + "error": false, + "test": "Throughput", + "tp": 1, + "tps_mean": 267.99881204205957 } ] } \ No newline at end of file diff --git a/scripts/start_vllm.py b/scripts/start_vllm.py index 92b7b63..3eebc26 100644 --- a/scripts/start_vllm.py +++ b/scripts/start_vllm.py @@ -36,6 +36,22 @@ else: HOST = os.getenv("HOST", "0.0.0.0") PORT = os.getenv("PORT", "8000") +def detect_gpus(): + """Detects AMD GPUs via rocm-smi or /dev/dri.""" + try: + # Try rocm-smi first + res = subprocess.run(["rocm-smi", "--showid", "--csv"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if res.returncode == 0: + count = res.stdout.count("GPU") + if count > 0: return count + except: pass + + # Fallback to /dev/dri/render* + try: + return len(list(Path("/dev/dri").glob("renderD*"))) + except: + return 1 + def get_discovered_models(): """ Overrides the hardcoded MODELS_TO_RUN by looking at what we actually have results for. @@ -93,22 +109,6 @@ def check_dependencies(): print("Error: 'dialog' is required. Please install it (apt-get install dialog).") sys.exit(1) -def detect_gpus(): - """Detects AMD GPUs via rocm-smi or /dev/dri.""" - try: - # Try rocm-smi first - res = subprocess.run(["rocm-smi", "--showid", "--csv"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if res.returncode == 0: - count = res.stdout.count("GPU") - if count > 0: return count - except: pass - - # Fallback to /dev/dri/render* - try: - return len(list(Path("/dev/dri").glob("renderD*"))) - except: - return 1 - def get_verified_config(model_id, tp_size, max_seqs): """ Reads max_context_results.json to find the best verified configuration. @@ -334,7 +334,15 @@ def configure_and_launch(model_idx, gpu_count): print(f" Backend: {'ROCm' if use_rocm_attn else 'Triton'}") if clear_cache: print(f" Action: Clearing vLLM Cache (~/.cache/vllm)") - print(f" Command: {' '.join(cmd)}") + + # Variables that represent the custom environment overrides for models + custom_env = config.get("env", {}) + if custom_env: + print("\n --- Environment Variables ---") + for k, v in custom_env.items(): + print(f" export {k}={v}") + + print(f"\n Command: {' '.join(cmd)}") print("="*60 + "\n") os.execvpe("vllm", cmd, env)