feat: Add script to automate README benchmark table generation and update max context benchmarks with new models and a kernel parameter change.

2026-02-02 22:32:12 +00:00
@@ -36,10 +36,12 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/
 | **`meta-llama/Meta-Llama-3.1-8B-Instruct`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
 | **`google/gemma-3-12b-it`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
 | **`openai/gpt-oss-20b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
-| **`Qwen/Qwen3-14B-AWQ`** | 1 | 40k (0.90) | 40k (0.90) | 40k (0.90) | 40k (0.90) |
-| **`cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit`** | 1 | 256k (0.95) | 204k (0.90) | - | - |
-| **`dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16`** | 1 | 256k (0.90) | - | - | - |
+| **`Qwen/Qwen3-14B-AWQ`** | 1 | 40k (0.95) | 40k (0.95) | 40k (0.95) | 40k (0.95) |
+| **`btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit`** | 1 | 256k (0.95) | 256k (0.95) | 256k (0.95) | 256k (0.95) |
+| **`btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit`** | 1 | 256k (0.95) | 256k (0.95) | 256k (0.95) | 256k (0.95) |
+| **`dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16`** | 1 | 256k (0.95) | 256k (0.95) | 256k (0.95) | 256k (0.95) |
 | **`openai/gpt-oss-120b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
+| **`zai-org/GLM-4.7-Flash`** | 1 | 198k (0.95) | 198k (0.95) | 198k (0.95) | 198k (0.95) |


 ---
@@ -184,7 +186,7 @@ amd_iommu=pt amdgpu.gttsize=126976 ttm.pages_limit=32505856

 | Parameter                   | Purpose                                                                                    |
 |-----------------------------|--------------------------------------------------------------------------------------------|
-| `amd_iommu=pt`              | Sets IOMMU to pass-through mode; reduces DMA overhead for better performance               |
+| `amd_iommu=off`             | Disables AMD IOMMU to reduce overhead for better performance                                |
 | `amdgpu.gttsize=126976`     | Caps GPU unified memory to 124 GiB; 126976 MiB ÷ 1024 = 124 GiB                            |
 | `ttm.pages_limit=32505856`  | Caps pinned memory to 124 GiB; 32505856 × 4 KiB = 126976 MiB = 124 GiB                     |

@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+import json
+import math
+from pathlib import Path
+
+# Config: results JSON is benchmarks/max_context_results.json under the parent of this script's directory.
+RESULTS_FILE = Path(__file__).parent.parent / "benchmarks/max_context_results.json"
+
+ORDER = [  # row order of the generated table; main() skips any model not listed here
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "google/gemma-3-12b-it",
+    "openai/gpt-oss-20b",
+    "Qwen/Qwen3-14B-AWQ",
+    "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+    "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+    "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+    "openai/gpt-oss-120b",
+    "zai-org/GLM-4.7-Flash"
+]
+
+def format_tokens(n):
+    """Format a token count: n/1024 truncated plus "k" when n >= 1024, else str(n)."""
+    kilo = 1024
+    return f"{int(n / kilo)}k" if n >= kilo else str(n)
+
+def main():
+    """Print the README max-context table (Markdown) for the models in ORDER.
+
+    Reads RESULTS_FILE, keeps entries whose status is "success", and prints
+    one row per (model, tp) with a "<context> (<util>)" cell for max_seqs
+    values 1, 4, 8 and 16 ("-" where no successful entry exists).
+    Rows follow ORDER; models that are not listed in ORDER, or that have no
+    successful entry, are omitted.
+
+    Raises:
+        SystemExit: if RESULTS_FILE does not exist. The message is written to
+            stderr and the exit status is non-zero, so redirected table
+            output stays clean and callers can detect the failure.
+    """
+    if not RESULTS_FILE.exists():
+        raise SystemExit(f"Error: {RESULTS_FILE} not found.")
+
+    with open(RESULTS_FILE, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    # Organize data: model -> tp -> max_seqs -> (ctx, util)
+    models = {}
+    for entry in data:
+        if entry["status"] != "success":
+            continue
+
+        by_seqs = models.setdefault(entry["model"], {}).setdefault(entry["tp"], {})
+        # If several successful entries share model/tp/max_seqs, the last one
+        # in the file wins.
+        by_seqs[entry["max_seqs"]] = (
+            entry["max_context_1_user"],
+            float(entry["util"]),
+        )
+
+    # Generate Table
+    print("| Model | TP | 1 Req | 4 Reqs | 8 Reqs | 16 Reqs |")
+    print("| :--- | :--- | :--- | :--- | :--- | :--- |")
+
+    for model_name in ORDER:
+        # Models without any successful entry are left out of the table.
+        if model_name not in models:
+            continue
+
+        for tp in sorted(models[model_name]):
+            row = [f"**`{model_name}`**", str(tp)]
+
+            for req in [1, 4, 8, 16]:
+                val = models[model_name][tp].get(req)
+                if val is None:
+                    row.append("-")
+                else:
+                    ctx, util = val
+                    row.append(f"{format_tokens(ctx)} ({util:.2f})")
+
+            print("| " + " | ".join(row) + " |")
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    main()