diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json
index 2be46b6..39c9888 100644
--- a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 229.17851571500069,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.4363410753753066,
-    "tokens_per_second": 328.49937859629955
+    "elapsed_time": 421.75657659699937,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.4742071875054738,
+    "tokens_per_second": 348.0799308087054
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
index be3df95..964d4fc 100644
--- a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
+++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 899.6009820629988,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.11116039443473733,
-    "tokens_per_second": 83.68710295019198
+    "elapsed_time": 868.8101008250001,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.2301999019234296,
+    "tokens_per_second": 168.9724830093454
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json
index f267aff..dcc2f2e 100644
--- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 442.1101265470061,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.2261879880043141,
-    "tokens_per_second": 170.28562676904787
+    "elapsed_time": 457.7749735690013,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.4368958801760569,
+    "tokens_per_second": 320.69249844623016
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json
index 70d2836..f96f2c5 100644
--- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json
+++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 639.3201232059982,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.15641616206061223,
-    "tokens_per_second": 117.75790760733192
+    "elapsed_time": 644.1538858940003,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.3104848148551126,
+    "tokens_per_second": 227.90361622402403
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json
index 24ead88..f371702 100644
--- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 577.3050836349939,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.1732186374842766,
-    "tokens_per_second": 130.40765123003763
+    "elapsed_time": 534.4193308840004,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.3742379596733028,
+    "tokens_per_second": 274.7000183491961
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json
index 667e938..3036af0 100644
--- a/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json
+++ b/benchmarks/benchmark_results/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 865.5675225800005,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.115531136960788,
-    "tokens_per_second": 86.97761646092924
+    "elapsed_time": 733.5017090729998,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.2726646680247824,
+    "tokens_per_second": 200.1426829468909
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json
index 9360b46..832db98 100644
--- a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 764.7424081899953,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.13076298493329488,
-    "tokens_per_second": 98.44491320703105
+    "elapsed_time": 879.0596038709991,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.22751585799106944,
+    "tokens_per_second": 167.00232766189475
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json
index 22eb3c4..d0f0fcd 100644
--- a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json
+++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 1052.5878375879984,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.09500394782173208,
-    "tokens_per_second": 71.52372211759099
+    "elapsed_time": 1109.9732099440007,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.18018452896722634,
+    "tokens_per_second": 132.2599488751683
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json
index 7ada1ea..0e851a2 100644
--- a/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 310.4935437940003,
-    "num_requests": 100,
-    "total_num_tokens": 76062,
-    "requests_per_second": 0.3220678883627477,
-    "tokens_per_second": 244.97127724647316
+    "elapsed_time": 504.69023761399876,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.39628268013570256,
+    "tokens_per_second": 294.9472545848014
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json
index 1cd45b9..9a618f6 100644
--- a/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json
+++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 550.0459713920009,
-    "num_requests": 100,
-    "total_num_tokens": 76062,
-    "requests_per_second": 0.18180298593393945,
-    "tokens_per_second": 138.28298716107304
+    "elapsed_time": 876.911706677,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.22807313265081958,
+    "tokens_per_second": 169.75141153501525
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json
index fa194ba..b3657d4 100644
--- a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 193.03236384499905,
-    "num_requests": 100,
-    "total_num_tokens": 74504,
-    "requests_per_second": 0.5180478444552329,
-    "tokens_per_second": 385.96636603292677
+    "elapsed_time": 244.51837097500174,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.8179344529513773,
+    "tokens_per_second": 596.5891209659404
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
index 89251dc..c439d54 100644
--- a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
+++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 311.826995067001,
-    "num_requests": 100,
-    "total_num_tokens": 74504,
-    "requests_per_second": 0.3206906444341466,
-    "tokens_per_second": 238.92735772921657
+    "elapsed_time": 380.55349342600005,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.5255502930730307,
+    "tokens_per_second": 383.3285005130725
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json
index 4fb2c8a..5e00991 100644
--- a/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 639.9174838529943,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.15627014814143225,
-    "tokens_per_second": 117.11822522607781
+    "elapsed_time": 1307.2402118169994,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.15299406963775225,
+    "tokens_per_second": 112.4781801162827
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json
index 868843d..82e552b 100644
--- a/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json
+++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 1055.754198749999,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.09471901709545542,
-    "tokens_per_second": 70.98811455236003
+    "elapsed_time": 1886.751298176,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.10600231211890418,
+    "tokens_per_second": 77.93077982357597
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json
index 4d612e7..4ae2cb0 100644
--- a/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 271.0714236530039,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.36890646255655896,
-    "tokens_per_second": 276.48063742763867
+    "elapsed_time": 247.62527259899798,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.8076720033495051,
+    "tokens_per_second": 593.7843034224891
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json
index 39de487..2220af1 100644
--- a/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json
+++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 404.31172934999995,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.24733390782594175,
-    "tokens_per_second": 185.3668705592303
+    "elapsed_time": 341.2666312900001,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.5860520240258851,
+    "tokens_per_second": 430.8537270233502
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json
index d4dab53..4ca4bb0 100644
--- a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 374.65702054300345,
-    "num_requests": 100,
-    "total_num_tokens": 75027,
-    "requests_per_second": 0.2669107864442698,
-    "tokens_per_second": 200.2551557455423
+    "elapsed_time": 422.7612150579989,
+    "num_requests": 200,
+    "total_num_tokens": 146278,
+    "requests_per_second": 0.47308029420949094,
+    "tokens_per_second": 346.0061963818796
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json
index 837981e..de26852 100644
--- a/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json
+++ b/benchmarks/benchmark_results/zai-org_GLM-4.7-Flash_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 560.7857336160014,
-    "num_requests": 100,
-    "total_num_tokens": 75027,
-    "requests_per_second": 0.17832122681721982,
-    "tokens_per_second": 133.7890668441555
+    "elapsed_time": 594.5536415039987,
+    "num_requests": 200,
+    "total_num_tokens": 146278,
+    "requests_per_second": 0.33638680522429343,
+    "tokens_per_second": 246.02994547299596
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json
index 11dd77c..1a4060e 100644
--- a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 234.33056626700272,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.42674757114723666,
-    "tokens_per_second": 321.2769089381971
+    "elapsed_time": 395.26841144900027,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.5059852854591319,
+    "tokens_per_second": 371.4058491591393
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
index d967516..84a4499 100644
--- a/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 874.8529941339984,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.11430491827828541,
-    "tokens_per_second": 86.05445772580717
+    "elapsed_time": 769.1666062429999,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.260021688898978,
+    "tokens_per_second": 190.86242019407229
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json
index 048c4b9..58e09ea 100644
--- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 438.29837328500435,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.2281550790401287,
-    "tokens_per_second": 171.7665512553609
+    "elapsed_time": 464.71097393700256,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.43037503139986644,
+    "tokens_per_second": 315.906032423287
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json
index ec3251f..ca60feb 100644
--- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-4bit_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 621.1952276929987,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.16097998751758127,
-    "tokens_per_second": 121.19378360261106
+    "elapsed_time": 638.3282979609994,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.31331839844615444,
+    "tokens_per_second": 229.9835374194385
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json
index b000cae..7b2b31b 100644
--- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 573.4174093670008,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.17439303091685104,
-    "tokens_per_second": 131.2917933257513
+    "elapsed_time": 502.6907218439992,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.3978589444944367,
+    "tokens_per_second": 292.0384117325289
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json
index c8da1d6..d048712 100644
--- a/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/btbtyler09_Qwen3-Coder-30B-A3B-Instruct-gptq-8bit_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 838.6815635629973,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.11923476602390883,
-    "tokens_per_second": 89.76589360109976
+    "elapsed_time": 721.7994779089986,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.2770852655357769,
+    "tokens_per_second": 203.38751203489863
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json
index 4186c55..89f11e0 100644
--- a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 746.2110970310023,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.13401033621434522,
-    "tokens_per_second": 100.8896816189698
+    "elapsed_time": 886.8526372269989,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.2255166096425645,
+    "tokens_per_second": 165.5348293928834
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json
index 097b399..f5d2546 100644
--- a/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 1064.0833694909998,
-    "num_requests": 100,
-    "total_num_tokens": 75285,
-    "requests_per_second": 0.09397759881148657,
-    "tokens_per_second": 70.75103526522766
+    "elapsed_time": 1084.3601952080007,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.18444055848217136,
+    "tokens_per_second": 135.3839809398758
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json
index cee3264..0e86f1e 100644
--- a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 288.17777036500047,
-    "num_requests": 100,
-    "total_num_tokens": 76062,
-    "requests_per_second": 0.3470080286669645,
-    "tokens_per_second": 263.9412467646666
+    "elapsed_time": 369.2837602610016,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.5415889392445647,
+    "tokens_per_second": 403.09652364564084
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json
index 7e88a40..e96b629 100644
--- a/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/google_gemma-3-12b-it_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 422.3444380089968,
-    "num_requests": 100,
-    "total_num_tokens": 76062,
-    "requests_per_second": 0.23677356915463818,
-    "tokens_per_second": 180.0947121704009
+    "elapsed_time": 509.0738683320001,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.39287029337276264,
+    "tokens_per_second": 292.4074663029466
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json
index 435fa83..efbb6ad 100644
--- a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 195.17220506099693,
-    "num_requests": 100,
-    "total_num_tokens": 74504,
-    "requests_per_second": 0.5123680391311207,
-    "tokens_per_second": 381.7346838742502
+    "elapsed_time": 224.76228898300178,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.8898289873490544,
+    "tokens_per_second": 649.02791593759
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
index 7efa523..4a572c5 100644
--- a/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 308.955628978998,
-    "num_requests": 100,
-    "total_num_tokens": 74504,
-    "requests_per_second": 0.32367107319089417,
-    "tokens_per_second": 241.1478963701438
+    "elapsed_time": 322.171811016,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.620786776376495,
+    "tokens_per_second": 452.7925628873698
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json
index ffd2b2b..0041800 100644
--- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 619.8506736600029,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.16132917854155862,
-    "tokens_per_second": 120.90976614975652
+    "elapsed_time": 1315.035868578001,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.15208710635115047,
+    "tokens_per_second": 111.8113988472388
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json
index df78dc3..e488eec 100644
--- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-120b_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 1103.3186353329984,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.09063564848591402,
-    "tokens_per_second": 67.92779311425312
+    "elapsed_time": 1923.4690410719995,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.10397879858182421,
+    "tokens_per_second": 76.44313314138553
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json
index 4c0bd36..ef3781e 100644
--- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json
+++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_cluster_tp2_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 260.26568330699956,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.38422276317559617,
-    "tokens_per_second": 287.95959208958226
+    "elapsed_time": 246.0529060009976,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.8128333180474167,
+    "tokens_per_second": 597.5787987620997
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json
index 5e1de5c..2f47f62 100644
--- a/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json
+++ b/benchmarks/benchmark_results_rocm/openai_gpt-oss-20b_tp1_throughput.json
@@ -1,7 +1,7 @@
 {
-    "elapsed_time": 405.08143226500033,
-    "num_requests": 100,
-    "total_num_tokens": 74946,
-    "requests_per_second": 0.24686394397504988,
-    "tokens_per_second": 185.01465145154089
+    "elapsed_time": 333.59849170300004,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.5995230943012126,
+    "tokens_per_second": 440.75738846836555
 }
\ No newline at end of file
diff --git a/docs/results.json b/docs/results.json
index 876284c..e457025 100644
--- a/docs/results.json
+++ b/docs/results.json
@@ -12,7 +12,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 238.92735772921657
+      "tps_mean": 383.3285005130725
     },
     {
       "model": "google/gemma-3-12b-it",
@@ -26,7 +26,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 138.28298716107304
+      "tps_mean": 169.75141153501525
     },
     {
       "model": "Qwen/Qwen3-14B-AWQ",
@@ -40,7 +40,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 83.68710295019198
+      "tps_mean": 168.9724830093454
     },
     {
       "model": "openai/gpt-oss-20b",
@@ -54,7 +54,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 185.3668705592303
+      "tps_mean": 430.8537270233502
     },
     {
       "model": "openai/gpt-oss-120b",
@@ -68,7 +68,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 70.98811455236003
+      "tps_mean": 77.93077982357597
     },
     {
       "model": "zai-org/GLM-4.7-Flash",
@@ -82,7 +82,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 133.7890668441555
+      "tps_mean": 246.02994547299596
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
@@ -96,7 +96,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 117.75790760733192
+      "tps_mean": 227.90361622402403
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
@@ -110,7 +110,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 86.97761646092924
+      "tps_mean": 200.1426829468909
     },
     {
       "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
@@ -124,7 +124,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 71.52372211759099
+      "tps_mean": 132.2599488751683
     },
     {
       "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -138,7 +138,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 385.96636603292677
+      "tps_mean": 596.5891209659404
     },
     {
       "model": "google/gemma-3-12b-it",
@@ -152,7 +152,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 244.97127724647316
+      "tps_mean": 294.9472545848014
     },
     {
       "model": "Qwen/Qwen3-14B-AWQ",
@@ -166,7 +166,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 328.49937859629955
+      "tps_mean": 348.0799308087054
     },
     {
       "model": "openai/gpt-oss-20b",
@@ -180,7 +180,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 276.48063742763867
+      "tps_mean": 593.7843034224891
     },
     {
       "model": "openai/gpt-oss-120b",
@@ -194,7 +194,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 117.11822522607781
+      "tps_mean": 112.4781801162827
     },
     {
       "model": "zai-org/GLM-4.7-Flash",
@@ -208,7 +208,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 200.2551557455423
+      "tps_mean": 346.0061963818796
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
@@ -222,7 +222,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 170.28562676904787
+      "tps_mean": 320.69249844623016
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
@@ -236,7 +236,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 130.40765123003763
+      "tps_mean": 274.7000183491961
     },
     {
       "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
@@ -250,7 +250,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 98.44491320703105
+      "tps_mean": 167.00232766189475
     },
     {
       "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -264,7 +264,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 241.1478963701438
+      "tps_mean": 452.7925628873698
     },
     {
       "model": "google/gemma-3-12b-it",
@@ -278,7 +278,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 180.0947121704009
+      "tps_mean": 292.4074663029466
     },
     {
       "model": "Qwen/Qwen3-14B-AWQ",
@@ -292,7 +292,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 86.05445772580717
+      "tps_mean": 190.86242019407229
     },
     {
       "model": "openai/gpt-oss-20b",
@@ -306,7 +306,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 185.01465145154089
+      "tps_mean": 440.75738846836555
     },
     {
       "model": "openai/gpt-oss-120b",
@@ -320,7 +320,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 67.92779311425312
+      "tps_mean": 76.44313314138553
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
@@ -334,7 +334,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 121.19378360261106
+      "tps_mean": 229.9835374194385
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
@@ -348,7 +348,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 89.76589360109976
+      "tps_mean": 203.38751203489863
     },
     {
       "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
@@ -362,7 +362,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 1,
-      "tps_mean": 70.75103526522766
+      "tps_mean": 135.3839809398758
     },
     {
       "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -376,7 +376,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 381.7346838742502
+      "tps_mean": 649.02791593759
     },
     {
       "model": "google/gemma-3-12b-it",
@@ -390,7 +390,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 263.9412467646666
+      "tps_mean": 403.09652364564084
     },
     {
       "model": "Qwen/Qwen3-14B-AWQ",
@@ -404,7 +404,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 321.2769089381971
+      "tps_mean": 371.4058491591393
     },
     {
       "model": "openai/gpt-oss-20b",
@@ -418,7 +418,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 287.95959208958226
+      "tps_mean": 597.5787987620997
     },
     {
       "model": "openai/gpt-oss-120b",
@@ -432,7 +432,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 120.90976614975652
+      "tps_mean": 111.8113988472388
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
@@ -446,7 +446,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 171.7665512553609
+      "tps_mean": 315.906032423287
     },
     {
       "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
@@ -460,7 +460,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 131.2917933257513
+      "tps_mean": 292.0384117325289
     },
     {
       "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
@@ -474,7 +474,7 @@
       "error": false,
       "test": "Throughput",
       "tp": 2,
-      "tps_mean": 100.8896816189698
+      "tps_mean": 165.5348293928834
     }
   ]
 }
\ No newline at end of file